Merge branch 'bert-opt2' into bert-opt3

9747cc44 · Paul · 48dbbd11 · db0301d7 · 9747cc44 · 9747cc44
Commit 9747cc44 authored Jul 06, 2022 by Paul
18 changed files
--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -244,6 +244,15 @@ struct context
        return hip_event_ptr{event};
    }
+    // Creates a HIP event with hipEventCreate (default flags); used for the
+    // start/stop events of perf measurement.
+    // Throws via MIGRAPHX_THROW, including the HIP error text, when creation fails.
+    static hip_event_ptr create_event_for_timing()
+    {
+        // Initialise the handle so it is never indeterminate if creation fails.
+        hipEvent_t event = nullptr;
+        auto status      = hipEventCreate(&event);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to create event: " + hip_error(status));
+        return hip_event_ptr{event};
+    }
    value to_value() const
    {
        value result;
@@ -267,10 +276,49 @@ struct context
    any_ptr get_queue() { return get_stream().get(); }
+    // Switches perf measurement on (default) or off for this context.
+    // On: creates new start/stop events and records each through get_stream().
+    // Off: resets both events to null.
+    // measure_perf is written last, after the events are in the matching state.
+    void enable_perf_measurement(bool b = true)
+    {
+        if(not b)
+        {
+            start_event  = nullptr;
+            stop_event   = nullptr;
+            measure_perf = false;
+            return;
+        }
+        start_event = create_event_for_timing();
+        stop_event  = create_event_for_timing();
+        get_stream().record(start_event.get());
+        get_stream().record(stop_event.get());
+        measure_perf = true;
+    }
+    // Returns the raw (start, stop) event handles while measure_perf is set;
+    // otherwise both entries are null.
+    std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
+    {
+        std::pair<hipEvent_t, hipEvent_t> events{nullptr, nullptr};
+        if(measure_perf)
+        {
+            events.first  = start_event.get();
+            events.second = stop_event.get();
+        }
+        return events;
+    }
+    // Returns the value hipEventElapsedTime reports between the start and stop
+    // events, or 0 when either event is null.
+    // Throws via MIGRAPHX_THROW if the HIP call does not return hipSuccess.
+    float get_elapsed_ms() const
+    {
+        if(start_event == nullptr or stop_event == nullptr)
+            return 0;
+        float elapsed = 0;
+        auto status   = hipEventElapsedTime(&elapsed, start_event.get(), stop_event.get());
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
+        return elapsed;
+    }
    private:
    // TODO: Make this a vector to support multiple devices
    std::shared_ptr<hip_device> current_device;
    std::vector<shared<hip_event_ptr>> events;
+    bool measure_perf                 = false;
+    shared<hip_event_ptr> start_event = nullptr;
+    shared<hip_event_ptr> stop_event  = nullptr;
 };
 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }

--- a/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
@@ -39,20 +39,20 @@ struct miopen_deconvolution
    op::deconvolution op;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        // TODO: Add algo
+        return pack_join(op::deconvolution::reflect(self.op, f),
-        return op::convolution::reflect(self.op, f);
+                         pack(f(self.solution_id, "solution_id")));
    }
    std::string name() const { return "gpu::deconv"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {

--- a/src/targets/gpu/include/migraphx/gpu/mlir_conv.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir_conv.hpp
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_MLIR_CONV_HPP
+#ifndef MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
-#define MIGRAPHX_GUARD_RTGLIB_MIOPEN_MLIR_CONV_HPP
+#define MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
 #include <migraphx/config.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -30,18 +30,19 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-struct module;
+struct module_pass_manager;
 namespace gpu {
-struct mlir_conv
+struct fuse_mlir
 {
-    context* ctx;
+    context* ctx = nullptr;
-    std::string name() const { return "mlir::convolution"; }
+    std::string name() const { return "gpu::fuse_mlir"; }
-    void apply(module& m) const;
+    void apply(module_pass_manager& mpm) const;
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/hip.hpp
@@ -37,6 +37,8 @@ namespace gpu {
 struct context;
+std::string hip_error(int error);
 argument allocate_gpu(const shape& s, bool host = false);
 argument register_on_gpu(const argument& arg);

--- a/src/targets/gpu/include/migraphx/gpu/kernel.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/kernel.hpp
@@ -50,17 +50,22 @@ struct kernel
    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
-                const std::vector<kernel_argument>& args) const;
+                const std::vector<kernel_argument>& args,
+                hipEvent_t start = nullptr,
+                hipEvent_t stop  = nullptr) const;
    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
-                std::vector<void*> args) const;
+                std::vector<void*> args,
+                hipEvent_t start = nullptr,
+                hipEvent_t stop  = nullptr) const;
-    auto launch(hipStream_t stream, std::size_t global, std::size_t local) const
+    template <class... Ts>
+    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
    {
        return [=](auto&&... xs) {
-            launch(stream, global, local, std::vector<kernel_argument>{xs...});
+            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
        };
    }

--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP
+#include <string>
+#include <vector>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/code_object_op.hpp>
+#include <migraphx/instruction_ref.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct module;
+namespace gpu {
+std::string dump_mlir(const module& m);
+code_object_op compile_mlir(const context& ctx, const module& m);
+instruction_ref insert_mlir(module& m,
+                            instruction_ref ins,
+                            code_object_op co,
+                            const std::vector<instruction_ref>& inputs);
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -41,7 +41,7 @@ struct miopen_quant_convolution
    bool int8_x4_format = false;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -55,7 +55,7 @@ struct miopen_quant_convolution
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {

--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/mlir.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+// Compiler entry for "gpu::mlir_conv": compiles the instruction's first module
+// input with compile_mlir and replaces the instruction with the result of
+// insert_mlir.
+struct mlir_compiler : compiler<mlir_compiler>
+{
+    std::vector<std::string> names() const { return {"gpu::mlir_conv"}; }
+    // Returns a default-constructed operation.
+    operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
+    // Builds the replacement callback around an already compiled code object.
+    compiler_replace insert(code_object_op co) const
+    {
+        return [co = std::move(co)](module& m, instruction_ref ins) {
+            m.replace_instruction(ins, insert_mlir(m, ins, co, ins->inputs()));
+        };
+    }
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation&) const
+    {
+        auto* submodule = ins->module_inputs().front();
+        // The submodule must have exactly one parameter fewer than the instruction has inputs.
+        assert(submodule->get_parameter_names().size() == ins->inputs().size() - 1);
+        return insert(compile_mlir(ctx, *submodule));
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/kernel.cpp
+++ b/src/targets/gpu/kernel.cpp
@@ -80,7 +80,9 @@ void launch_kernel(hipFunction_t fun,
                   std::size_t global,
                   std::size_t local,
                   void* kernargs,
-                   std::size_t size)
+                   std::size_t size,
+                   hipEvent_t start,
+                   hipEvent_t stop)
 {
    assert(global > 0);
    assert(local > 0);
@@ -97,34 +99,55 @@ void launch_kernel(hipFunction_t fun,
 #endif
    };
-    auto status = hipExtModuleLaunchKernel(
+    auto status = hipExtModuleLaunchKernel(fun,
-        fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, reinterpret_cast<void**>(&config));
+                                           global,
+                                           1,
+                                           1,
+                                           local,
+                                           1,
+                                           1,
+                                           0,
+                                           stream,
+                                           nullptr,
+                                           reinterpret_cast<void**>(&config),
+                                           start,
+                                           stop);
    if(status != hipSuccess)
        MIGRAPHX_THROW("Failed to launch kernel: " + hip_error(status));
+    if(stop)
+    {
+        status = hipEventSynchronize(stop);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to sync event: " + hip_error(status));
+    }
 }
 void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
-                    std::vector<void*> args) const
+                    std::vector<void*> args,
+                    hipEvent_t start,
+                    hipEvent_t stop) const
 {
    assert(impl != nullptr);
    void* kernargs   = args.data();
    std::size_t size = args.size() * sizeof(void*);
-    launch_kernel(impl->fun, stream, global, local, kernargs, size);
+    launch_kernel(impl->fun, stream, global, local, kernargs, size, start, stop);
 }
 void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
-                    const std::vector<kernel_argument>& args) const
+                    const std::vector<kernel_argument>& args,
+                    hipEvent_t start,
+                    hipEvent_t stop) const
 {
    assert(impl != nullptr);
    std::vector<char> kernargs = pack_args(args);
    std::size_t size           = kernargs.size();
-    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
+    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size, start, stop);
 }
 } // namespace gpu

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -300,7 +300,7 @@ struct miopen_apply
            auto&& op = any_cast<op::deconvolution>(ins->get_operator());
            auto conv = miopen_deconvolution{op, make_deconv(op)};
-            auto ws   = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
+            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
            auto workspace = insert_allocation(ins, ws);
            auto output    = insert_allocation(ins, ins->get_shape());
@@ -331,7 +331,7 @@ struct miopen_apply
            miopen_quant_convolution conv;
            auto compile_quant_conv_with_format = [&](bool format) {
                conv = miopen_quant_convolution{op, format, make_conv(op)};
-                ws   = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
+                ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
            };
            try

--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
--- a/src/targets/gpu/mlir_conv.cpp
+++ b/src/targets/gpu/mlir_conv.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/mlir_conv.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/make_op.hpp>
-#include <migraphx/op/convolution.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/convolution.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/program.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/make_op.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/program.hpp>
-#include <migraphx/gpu/kernel.hpp>
-#include <migraphx/gpu/target.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/gpu/compile_hip.hpp>
-#include <utility>
-#include <functional>
-#include <algorithm>
-#ifdef MIGRAPHX_MLIR_MIOPEN_SUPPORT
-#include <Miir.h>
-#endif // MIGRAPHX_MLIR_MIOPEN_SUPPORT
-#include <cstdio>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct mlir_apply
-{
-    module* mod           = nullptr;
-    const mlir_conv* pass = nullptr;
-    const char* mlir_kernel_name = "migraphx_conv2d";
-    std::unordered_map<uint64_t, instruction_ref> literal_map{};
-    struct execution_spec
-    {
-        migraphx::value::binary binary;
-        size_t global_size;
-        size_t local_size;
-        execution_spec(migraphx::value::binary&& binary_m, size_t global_s, size_t local_s)
-            : binary(std::move(binary_m)), global_size(global_s), local_size(local_s)
-        {
-        }
-    };
-    std::unordered_map<std::string, std::shared_ptr<execution_spec>> binary_map{};
-    context& get_context() const
-    {
-        assert(pass != nullptr);
-        assert(pass->ctx != nullptr);
-        return *pass->ctx;
-    }
-    void init() const
-    {
-        assert(mod != nullptr);
-        assert(pass != nullptr);
-    }
-    std::shared_ptr<execution_spec> make_mlir_binary(instruction_ref op_r)
-    {
-        std::shared_ptr<execution_spec> result;
-#ifdef MIGRAPHX_MLIR_MIOPEN_SUPPORT
-        auto conv  = any_cast<op::convolution>(op_r->get_operator());
-        auto inp_t = op_r->inputs().at(0)->get_shape();
-        auto flt_t = op_r->inputs().at(1)->get_shape();
-        auto out_t = op_r->get_shape();
-        auto get_type_str = [](const shape& s) -> const char* {
-            switch(s.type())
-            {
-            case shape::float_type: return "f32";
-            case shape::half_type: return "f16";
-            case shape::bool_type:
-            case shape::double_type:
-            case shape::uint8_type:
-            case shape::int8_type:
-            case shape::uint16_type:
-            case shape::int16_type:
-            case shape::int32_type:
-            case shape::int64_type:
-            case shape::uint32_type:
-            case shape::uint64_type:
-            case shape::tuple_type: break;
-            }
-            return nullptr;
-        };
-        const auto* inp_t_s = get_type_str(inp_t);
-        const auto* flt_t_s = get_type_str(flt_t);
-        const auto* out_t_s = get_type_str(out_t);
-        if(out_t_s == nullptr || inp_t_s == nullptr || flt_t_s == nullptr)
-            return result;
-        std::string mlir_options = "--kernel_name " + std::string(mlir_kernel_name);
-        // platform spec
-        auto& device = get_context().get_current_device();
-        char dev_name[64];
-        sprintf(dev_name, "gfx%lu%02lu", device.get_device_major(), device.get_device_minor());
-        mlir_options += " --arch " + std::string(dev_name) + " --num_cu " +
-                        std::to_string(device.get_cu_count()); // ???
-        // Conv spec
-        mlir_options +=
-            " --operation "
-            "conv2d"
-            " --batchsize " +
-            std::to_string(conv.group) + " --groupsize " + std::to_string(1) + " --padding_h " +
-            std::to_string(conv.padding[0]) + " --padding_w " + std::to_string(conv.padding[1]) +
-            " --conv_stride_h " + std::to_string(conv.stride[0]) + " --conv_stride_w " +
-            std::to_string(conv.stride[1]) + " --dilation_h " + std::to_string(conv.dilation[0]) +
-            " --dilation_w " + std::to_string(conv.dilation[1]);
-        // Input spec
-        mlir_options += " --in_layout "
-                        "NCHWG"
-                        " --in_type " +
-                        std::string(inp_t_s) + " --in_channels " + std::to_string(inp_t.lens()[1]) +
-                        " --in_h " + std::to_string(inp_t.lens()[2]) + " --in_w " +
-                        std::to_string(inp_t.lens()[3]);
-        // Filter spec
-        mlir_options += " --fil_layout "
-                        "NCHWG"
-                        " --fil_type " +
-                        std::string(flt_t_s) + " --fil_h " + std::to_string(flt_t.lens()[2]) +
-                        " --fil_w " + std::to_string(flt_t.lens()[3]);
-        // Output spec
-        mlir_options += " --out_layout "
-                        "NCHWG"
-                        " --out_type " +
-                        std::string(out_t_s) + " --out_channels " +
-                        std::to_string(out_t.lens()[1]) + " --out_h " +
-                        std::to_string(out_t.lens()[2]) + " --out_w " +
-                        std::to_string(out_t.lens()[3]);
-        auto bin_i = binary_map.find(mlir_options);
-        if(bin_i == binary_map.end())
-        {
-            size_t bin_size = 0;
-            using mlir_handle = MIGRAPHX_MANAGE_PTR(MiirHandle, miirDestroyHandle);
-            auto handle       = mlir_handle(miirCreateHandle(mlir_options.c_str()));
-            if(miirLowerBin(handle.get()) == MIIR_SUCCESS &&
-               miirBufferGet(handle.get(), nullptr, &bin_size) == MIIR_SUCCESS)
-            {
-                migraphx::value::binary bin(bin_size);
-                if(miirBufferGet(handle.get(), reinterpret_cast<char*>(bin.data()), &bin_size) ==
-                   MIIR_SUCCESS)
-                {
-                    size_t global_size;
-                    size_t block_size;
-                    if(miirGetExecutionDims(handle.get(), &global_size, &block_size) ==
-                       MIIR_SUCCESS)
-                    {
-                        result = std::make_shared<execution_spec>(
-                            std::move(bin), global_size, block_size);
-                    }
-                }
-            }
-            binary_map[mlir_options] = result;
-        }
-        else
-        {
-            result = bin_i->second;
-        }
-#else  // MIGRAPHX_MLIR_MIOPEN_SUPPORT
-        (void)op_r;
-#endif // MIGRAPHX_MLIR_MIOPEN_SUPPORT
-        return result;
-    }
-    instruction_ref get_literal(uint64_t value)
-    {
-        auto fi = literal_map.find(value);
-        if(fi != literal_map.end())
-            return fi->second;
-        auto lit = mod->add_literal(value);
-        literal_map.emplace(value, lit);
-        return lit;
-    }
-    operation make_code_object_op(instruction_ref op_r, const std::shared_ptr<execution_spec>& spec)
-    {
-        // each pointer is expanded out to a MemRefDescriptor
-        auto inp_t = op_r->inputs().at(0)->get_shape();
-        auto flt_t = op_r->inputs().at(1)->get_shape();
-        auto out_t = op_r->get_shape();
-        auto i64 = shape(shape::uint64_type);
-        std::vector<shape> expected_inputs = {
-            flt_t, flt_t, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,  i64,   inp_t,
-            inp_t, i64,   i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,  out_t, out_t,
-            i64,   i64,   i64, i64, i64, i64, i64, i64, i64, i64, i64, out_t};
-        return migraphx::make_op("gpu::code_object",
-                                 {
-                                     {"code_object", spec->binary},
-                                     {"symbol_name", mlir_kernel_name},
-                                     {"global", spec->global_size},
-                                     {"local", spec->local_size},
-                                     {"expected_inputs", migraphx::to_value(expected_inputs)},
-                                     {"output", migraphx::to_value(out_t)},
-                                 });
-    }
-    void add_memref_descriptor(std::vector<instruction_ref>& refs, instruction_ref inst)
-    {
-        const size_t offset = 0;
-        auto inst_t         = inst->get_shape();
-        refs.push_back(inst);
-        refs.push_back(inst);
-        refs.push_back(get_literal(offset)); // offset
-        // dim sizes
-        std::transform(inst_t.lens().begin(),
-                       inst_t.lens().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        refs.push_back(get_literal(1)); // G
-        // dim strides
-        std::transform(inst_t.strides().begin(),
-                       inst_t.strides().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        refs.push_back(get_literal(1)); // G
-    }
-    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
-    {
-        return mod->insert_instruction(ins, hip_allocate{s});
-    }
-    void replace_conv_op(instruction_ref ins)
-    {
-        auto conv_bin = make_mlir_binary(ins);
-        if(conv_bin)
-        {
-            auto conv = make_code_object_op(ins, conv_bin);
-            auto inp = ins->inputs().at(0);
-            auto flt = ins->inputs().at(1);
-            auto out = insert_allocation(ins, ins->get_shape());
-            std::vector<instruction_ref> refs;
-            refs.reserve(3 * 13 + 1);
-            add_memref_descriptor(refs, flt);
-            add_memref_descriptor(refs, inp);
-            add_memref_descriptor(refs, out);
-            refs.push_back(out);
-            mod->replace_instruction(ins, conv, refs);
-        }
-    }
-    void apply()
-    {
-        init();
-        for(auto it : iterator_for(*mod))
-        {
-            if(it->name() == "convolution")
-            {
-                replace_conv_op(it);
-            }
-        }
-    }
-};
-void mlir_conv::apply(module& m) const { mlir_apply{&m, this}.apply(); }
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/quant_convolution.cpp
+++ b/src/targets/gpu/quant_convolution.cpp
@@ -67,9 +67,9 @@ argument miopen_quant_convolution::compute(context& ctx,
    return args[3];
 }
-shape miopen_quant_convolution::compile(context& ctx,
+shape miopen_quant_convolution::find(context& ctx,
-                                        const shape& output_shape,
+                                     const shape& output_shape,
-                                        std::vector<shape> inputs)
+                                     std::vector<shape> inputs)
 {
    shape workspace_shape{};
    auto x_desc = make_tensor(inputs[0], int8_x4_format);
@@ -92,18 +92,18 @@ shape miopen_quant_convolution::compile(context& ctx,
        x_shape = pack_int8_shape(x_shape);
        w_shape = pack_int8_shape(w_shape);
    }
-    auto arg_vec4_x = to_gpu(generate_argument(x_shape));
+    auto x         = to_gpu(generate_argument(x_shape));
-    auto arg_vec4_w = to_gpu(generate_argument(w_shape));
+    auto w         = to_gpu(generate_argument(w_shape));
-    auto y          = allocate_gpu(output_shape);
+    auto y         = allocate_gpu(output_shape);
-    auto workspace  = allocate_gpu(workspace_shape);
+    auto workspace = allocate_gpu(workspace_shape);
    int algo_count = 1;
    miopenConvAlgoPerf_t perf;
    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
                                                        x_desc.get(),
-                                                        arg_vec4_x.implicit(),
+                                                        x.implicit(),
                                                        w_desc.get(),
-                                                        arg_vec4_w.implicit(),
+                                                        w.implicit(),
                                                        cd.get(),
                                                        y_desc.get(),
                                                        y.implicit(),
@@ -114,11 +114,35 @@ shape miopen_quant_convolution::compile(context& ctx,
                                                        workspace_size,
                                                        false);
    if(status != miopenStatusSuccess)
-    {
+        MIGRAPHX_THROW("MIOpen Quant Convolution: find convolution failed");
-        MIGRAPHX_THROW("QUANT_CONVOLUTION: find convolution failed");
+    algo = perf.fwd_algo;
-    }
-    handle = ctx.get_stream().get_miopen();
+    size_t solution_count;
-    algo   = perf.fwd_algo;
+    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
+                                                      w_desc.get(),
+                                                      x_desc.get(),
+                                                      cd.get(),
+                                                      y_desc.get(),
+                                                      &solution_count);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution count failed");
+    std::vector<miopenConvSolution_t> solutions(solution_count);
+    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
+                                                 w_desc.get(),
+                                                 x_desc.get(),
+                                                 cd.get(),
+                                                 y_desc.get(),
+                                                 solution_count,
+                                                 &solution_count,
+                                                 solutions.data());
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution failed");
+    solution_id = solutions.front().solution_id;
    return shape{shape::int8_type, {perf.memory}};
 }
@@ -126,13 +150,29 @@ void miopen_quant_convolution::finalize(context& ctx,
                                        const shape& output_shape,
                                        std::vector<shape> inputs)
 {
-    if(handle == ctx.get_stream().get_miopen())
+    if(cd == nullptr)
-        return;
+        cd = make_conv(op);
-    // Check that workspace hasn't changed
+    if(solution_id == 0)
-    auto size = inputs.at(2).bytes();
+    {
-    auto ws   = compile(ctx, output_shape, std::move(inputs));
+        // Check that workspace hasn't changed
-    if(ws.bytes() > size)
+        auto size = inputs.at(2).bytes();
-        MIGRAPHX_THROW("Workspace has changed during finalization.");
+        auto ws   = find(ctx, output_shape, inputs);
+        if(ws.bytes() > size)
+            MIGRAPHX_THROW("MIOpen Quant Convolution: workspace has changed during finalization.");
+    }
+    auto x_desc = make_tensor(inputs[0], int8_x4_format);
+    auto w_desc = make_tensor(inputs[1], int8_x4_format);
+    auto y_desc = make_tensor(output_shape);
+    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+                                                          w_desc.get(),
+                                                          x_desc.get(),
+                                                          cd.get(),
+                                                          y_desc.get(),
+                                                          solution_id);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Quant Convolution: compile solution failed");
 }
 shape miopen_quant_convolution::pack_int8_shape(const shape& s) const

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -53,10 +53,10 @@
 #include <migraphx/gpu/compile_ops.hpp>
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
 #include <migraphx/gpu/lowering.hpp>
-#include <migraphx/gpu/mlir_conv.hpp>
 #include <migraphx/gpu/pack_int8_args.hpp>
 #include <migraphx/gpu/schedule_model.hpp>
 #include <migraphx/gpu/sync_device.hpp>
@@ -128,7 +128,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
        dead_code_elimination{},
-        mlir_conv{&ctx},
+        fuse_mlir{&ctx},
+        dead_code_elimination{},
        lowering{&ctx, options.offload_copy},
        eliminate_contiguous{"gpu::contiguous"},
        dead_code_elimination{},

--- a/test/gpu/mlir.cpp
+++ b/test/gpu/mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/mlir.hpp>
+#include <migraphx/gpu/target.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/write_literals.hpp>
+#include <migraphx/ref/target.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/verify_args.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/functional.hpp>
+#include <test.hpp>
+using migraphx::trim;
+// m test_gpu_mlir && ./bin/test_gpu_mlir
+// Test-only target derived from the GPU target. It reports the name "mlir" and
+// its get_passes returns a single write_literals pass.
+struct mlir_gpu_target : migraphx::gpu::target
+{
+    std::string name() const { return "mlir"; }
+    // The compile options argument is ignored.
+    std::vector<migraphx::pass> get_passes(migraphx::context& gctx,
+                                           const migraphx::compile_options&) const
+    {
+        auto* gpu_ctx = &migraphx::any_cast<migraphx::gpu::context>(gctx);
+        return {migraphx::gpu::write_literals{gpu_ctx}};
+    }
+};
+// Normalizes text so two strings can be compared regardless of whitespace layout.
+// A whitespace character emits the fixed separator "  " unless the previously
+// handled whitespace/printable character was also whitespace; printable
+// characters are copied through; every other character is dropped. The result
+// is passed through migraphx::trim before being returned.
+std::string encode(const std::string& s)
+{
+    std::stringstream ss;
+    bool prespace = false;
+    for(auto c : s)
+    {
+        // The <cctype> classifiers have undefined behaviour for negative values
+        // other than EOF, so convert to unsigned char before classifying.
+        const auto uc = static_cast<unsigned char>(c);
+        if(std::isspace(uc) != 0)
+        {
+            if(not prespace)
+                ss << "  ";
+            prespace = true;
+        }
+        else if(std::isprint(uc) != 0)
+        {
+            ss << c;
+            prespace = false;
+        }
+    }
+    return migraphx::trim(ss.str());
+}
+// Builds a program around the compiled form of mmlir: the main module gets one
+// parameter per parameter of mmlir, then an "output" parameter, and finally
+// whatever insert_mlir adds for the result of compile_mlir.
+migraphx::program create_program_from_mlir(const migraphx::module& mmlir)
+{
+    migraphx::program prog;
+    auto* main_mod = prog.get_main_module();
+    std::vector<migraphx::instruction_ref> args;
+    // Mirror every parameter of the source module, in get_parameter_names() order.
+    for(const auto& param_name : mmlir.get_parameter_names())
+        args.push_back(main_mod->add_parameter(param_name, mmlir.get_parameter_shape(param_name)));
+    // Order the arguments by the string form of their operator.
+    std::sort(args.begin(), args.end(), migraphx::by(std::less<>{}, [](auto ins) {
+                  return to_string(ins->get_operator());
+              }));
+    // The output parameter is appended after sorting, so it is always last.
+    args.push_back(main_mod->add_parameter("output", mmlir.get_output_shapes().front()));
+    migraphx::gpu::context ctx;
+    migraphx::gpu::insert_mlir(*main_mod, main_mod->end(), compile_mlir(ctx, mmlir), args);
+    return prog;
+}
+// Produces one generated argument per program parameter. The seed passed to
+// generate_argument starts at 0 and grows by one for each parameter visited, in
+// the iteration order of get_parameter_shapes().
+migraphx::parameter_map generate_params(const migraphx::program& p)
+{
+    migraphx::parameter_map params;
+    std::size_t seed = 0;
+    for(auto&& entry : p.get_parameter_shapes())
+    {
+        params[entry.first] = migraphx::generate_argument(entry.second, seed);
+        ++seed;
+    }
+    return params;
+}
+// Compiles p with mlir_gpu_target, evaluates it and returns the first result
+// after passing it through the target's copy_from.
+migraphx::argument run_gpu(migraphx::program p, const migraphx::parameter_map& inputs)
+{
+    mlir_gpu_target gpu_target;
+    p.compile(gpu_target);
+    migraphx::parameter_map device_args;
+    // Every supplied input goes through the target's copy_to.
+    for(auto&& in : inputs)
+        device_args[in.first] = gpu_target.copy_to(in.second);
+    // Parameters the caller did not supply get a buffer from the target's allocate.
+    for(auto&& param : p.get_parameter_shapes())
+    {
+        if(device_args.find(param.first) == device_args.end())
+            device_args[param.first] = gpu_target.allocate(param.second);
+    }
+    return gpu_target.copy_from(p.eval(device_args).front());
+}
+// Compiles p with the ref target and returns the first result of evaluating it
+// with the given inputs.
+migraphx::argument run_ref(migraphx::program p, const migraphx::parameter_map& inputs)
+{
+    migraphx::ref::target ref_target;
+    p.compile(ref_target);
+    auto results = p.eval(inputs);
+    return results.front();
+}
+// Runs the module once through run_ref and once through the program produced by
+// create_program_from_mlir via run_gpu, feeding both the same generated inputs,
+// and returns what verify_args reports for the two results.
+bool verify_mlir(const migraphx::module& mmlir)
+{
+    migraphx::program ref;
+    auto* ref_mm = ref.get_main_module();
+    ref_mm->insert_instructions(ref_mm->end(), &mmlir);
+    // Inputs are generated from the reference program's parameters.
+    const auto inputs = generate_params(ref);
+    const auto mlir   = create_program_from_mlir(mmlir);
+    const auto expected = run_ref(ref, inputs);
+    const auto actual   = run_gpu(mlir, inputs);
+    return migraphx::verify_args("mlir", expected, actual);
+}
+// Single convolution: the text returned by dump_mlir must match the expected
+// MLIR below once both are normalized with encode(), and verify_mlir must
+// report that the compiled module agrees with the reference run.
+TEST_CASE(conv)
+{
+    const std::string mlir_output = R"__migraphx__(
+module {
+  func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+    %0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
+    return %0 : tensor<1x2x2x2xf32>
+  }
+}
+)__migraphx__";
+    migraphx::module m;
+    auto x    = m.add_parameter("x", {migraphx::shape::float_type, {1, 8, 4, 4}});
+    auto w    = m.add_parameter("w", {migraphx::shape::float_type, {2, 8, 3, 3}});
+    auto conv = m.add_instruction(migraphx::make_op("convolution"), x, w);
+    m.add_return({conv});
+    auto s = migraphx::gpu::dump_mlir(m);
+    // Skip test if MLIR is not enabled
+    if(s.empty())
+        return;
+    // Compare the normalized text first, then the numerical results.
+    CHECK(encode(s) == encode(mlir_output));
+    EXPECT(verify_mlir(m));
+}
+// Convolution followed by add and relu: the text returned by dump_mlir must
+// match the expected MLIR below once both are normalized with encode(), and
+// verify_mlir must report that the compiled module agrees with the reference run.
+TEST_CASE(conv_add_relu)
+{
+    const std::string mlir_output = R"__migraphx__(
+module {
+  func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+    %0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
+    %1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
+    %2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
+    return %2 : tensor<1x2x2x2xf32>
+  }
+}
+)__migraphx__";
+    migraphx::module m;
+    auto x    = m.add_parameter("x", {migraphx::shape::float_type, {1, 8, 4, 4}});
+    auto w    = m.add_parameter("w", {migraphx::shape::float_type, {2, 8, 3, 3}});
+    auto b    = m.add_parameter("b", {migraphx::shape::float_type, {1, 2, 2, 2}});
+    auto conv = m.add_instruction(migraphx::make_op("convolution"), x, w);
+    auto add  = m.add_instruction(migraphx::make_op("add"), conv, b);
+    auto relu = m.add_instruction(migraphx::make_op("relu"), add);
+    m.add_return({relu});
+    auto s = migraphx::gpu::dump_mlir(m);
+    // Skip test if MLIR is not enabled
+    if(s.empty())
+        return;
+    // Compare the normalized text first, then the numerical results.
+    CHECK(encode(s) == encode(mlir_output));
+    EXPECT(verify_mlir(m));
+}
+// Test driver entry point: hands the command line to test::run.
+int main(int argc, const char* argv[])
+{
+    test::run(argc, argv);
+}
--- a/test/simplify_reshapes_test.cpp
+++ b/test/simplify_reshapes_test.cpp
@@ -1231,12 +1231,12 @@ TEST_CASE(transpose_slice_single_transpose)
        auto sqrt1  = m1.add_instruction(migraphx::make_op("sqrt"), slice1);
        auto slice2 = m1.add_instruction(
            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {12}}, {"ends", {24}}}), x);
-        auto transpose2 = m1.add_instruction(
+        auto transpose = m1.add_instruction(
            migraphx::make_op("transpose", {{"permutation", {0, 2, 1, 3}}}), slice2);
        auto slice3 = m1.add_instruction(
            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {24}}, {"ends", {36}}}), x);
        auto sqrt3 = m1.add_instruction(migraphx::make_op("sqrt"), slice3);
-        m1.add_return({sqrt1, transpose2, sqrt3});
+        m1.add_return({sqrt1, transpose, sqrt3});
    }
    migraphx::module m2 = m1;
    run_pass(m1);

--- a/test/verify/run_verify.cpp
+++ b/test/verify/run_verify.cpp
@@ -30,6 +30,7 @@
 #include <migraphx/ranges.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/load_save.hpp>
+#include <migraphx/tmp_dir.hpp>
 #include <migraphx/verify_args.hpp>
 #include <set>
@@ -57,6 +58,15 @@ std::future<typename std::result_of<Function()>::type> detach_async(Function&& f
    return std::async(std::launch::deferred, std::forward<Function>(f));
 }
+// Saves p to "test.mxr" inside a migraphx::tmp_dir, loads it back and expects
+// the loaded program to compare equal to p.
+inline void verify_load_save(const migraphx::program& p)
+{
+    migraphx::tmp_dir scratch{"migraphx_test"};
+    const std::string filename = (scratch.path / "test.mxr").string();
+    migraphx::save(p, filename);
+    auto loaded = migraphx::load(filename);
+    // Expression kept as-is because EXPECT receives it as written.
+    EXPECT(p == loaded);
+}
 inline void compile_check(migraphx::program& p, const migraphx::target& t, bool show_trace = false)
 {
    auto name   = t.name();
@@ -82,6 +92,8 @@ inline void compile_check(migraphx::program& p, const migraphx::target& t, bool
            throw std::runtime_error("Compiling program with " + name + " alters its shape");
        }
    }
+    if(t.name() != "ref")
+        verify_load_save(p);
 }
 target_info run_verify::get_target_info(const std::string& name) const
@@ -152,6 +164,7 @@ void run_verify::verify(const std::string& name, const migraphx::program& p) con
    auto_print::set_terminate_handler(name);
    if(migraphx::enabled(MIGRAPHX_DUMP_TEST{}))
        migraphx::save(p, name + ".mxr");
+    verify_load_save(p);
    std::vector<std::string> target_names;
    for(const auto& tname : migraphx::get_targets())
    {

--- a/test/verify/test_conv_add_relu.cpp
+++ b/test/verify/test_conv_add_relu.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/instruction.hpp>
+// Verification program: convolution of parameters "x" and "w", plus a
+// 4-element literal broadcast along axis 1 to the convolution's output lens,
+// followed by relu.
+struct test_conv_add_relu : verify_program<test_conv_add_relu>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog;
+        auto* main_mod = prog.get_main_module();
+        // Input and weights use the same 4x3x3x3 float shape.
+        const migraphx::shape tensor_shape{migraphx::shape::float_type, {4, 3, 3, 3}};
+        auto x = main_mod->add_parameter("x", tensor_shape);
+        auto w = main_mod->add_parameter("w", tensor_shape);
+        const migraphx::shape bias_shape{migraphx::shape::float_type, {4}};
+        auto bias =
+            main_mod->add_literal(migraphx::literal{bias_shape, {2.0f, 2.0f, 2.0f, 2.0f}});
+        auto conv = main_mod->add_instruction(migraphx::make_op("convolution"), x, w);
+        // Expand the bias to the convolution's output lens before adding it.
+        auto bias_bcast = main_mod->add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", conv->get_shape().lens()}}),
+            bias);
+        auto sum = main_mod->add_instruction(migraphx::make_op("add"), conv, bias_bcast);
+        main_mod->add_instruction(migraphx::make_op("relu"), sum);
+        return prog;
+    }
+};