Commit 9747cc44 authored by Paul

Merge branch 'bert-opt2' into bert-opt3

parents 48dbbd11 db0301d7
@@ -86,7 +86,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
RUN PATH=/opt/cmake/bin:$PATH cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@02078ce236ad90e3aec04c0c770ef5bfc99e49c2
RUN cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@26a4b3cfc0a1a15181490f24ae461608fef1b04e -DBUILD_MIXR_TARGET=On
ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
......
@@ -349,25 +349,27 @@ match::matcher_result find_match(module& modl, M&& m)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MATCHES)
/// Find matches for an instruction in the module
template <class... Ms>
void find_matches(module& mod, instruction_ref ins, Ms&&... ms)
template <class Mod, class... Ms>
void find_matches(Mod& mod, instruction_ref ins, Ms&&... ms)
{
#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 5
const
#endif
bool trace = enabled(MIGRAPHX_TRACE_MATCHES{});
bool match = false;
int trace = value_of(MIGRAPHX_TRACE_MATCHES{});
bool match = false;
each_args(
[&](auto&& m) {
if(match)
return;
auto r = match_instruction(mod, ins, m.matcher());
if(r.result == mod.end())
if(trace > 1)
std::cout << "Match: " << get_type_name(m) << std::endl;
auto r = match_instruction(get_module(mod), ins, m.matcher());
if(r.result == get_module(mod).end())
return;
if(trace)
if(trace > 0)
{
std::cout << "Matched by " << get_type_name(m) << std::endl;
mod.debug_print(ins);
get_module(mod).debug_print(ins);
}
m.apply(mod, r);
match = true;
@@ -376,10 +378,10 @@ void find_matches(module& mod, instruction_ref ins, Ms&&... ms)
}
/// Find matches in a module
template <class... Ms>
void find_matches(module& mod, Ms&&... ms)
template <class Mod, class... Ms>
void find_matches(Mod& mod, Ms&&... ms)
{
for(auto ins : iterator_for(mod))
for(auto ins : iterator_for(get_module(mod)))
{
find_matches(mod, ins, ms...);
}
......
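Note: find_matches is now templated on the container type and reaches the underlying module through an unqualified get_module call, so the same overloads accept either a plain module& or a pass-manager wrapper (as fuse_mlir does later in this diff). MIGRAPHX_TRACE_MATCHES also changes from a boolean to a level: greater than 0 prints each successful match, greater than 1 additionally prints every matcher as it is tried. Below is a minimal standalone sketch of the dispatch idiom, with stand-in types rather than the real MIGraphX classes; the module_pass_manager overload is assumed to mirror the real one.

#include <iostream>

struct module
{
    const char* name = "main";
};

struct module_pass_manager
{
    module mod;
};

// Identity overload, as added to module.hpp in this commit.
inline module& get_module(module& m) { return m; }
// Assumed overload for the wrapper; the real one returns the wrapped module.
inline module& get_module(module_pass_manager& mpm) { return mpm.mod; }

// Generic code touches the module only through get_module, so one template
// instantiates for both callers, mirroring the new find_matches signature.
template <class Mod>
void visit(Mod& mod)
{
    std::cout << get_module(mod).name << std::endl;
}

int main()
{
    module m;
    module_pass_manager mpm;
    visit(m);   // prints "main"
    visit(mpm); // prints "main"
}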
@@ -124,7 +124,7 @@ struct module
std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
std::vector<instruction_ref>
add_instructions(module_ref m,
add_instructions(const_module_ref m,
std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
std::vector<instruction_ref>
@@ -139,7 +139,7 @@ struct module
std::vector<instruction_ref>
insert_instructions(instruction_ref ins,
module_ref m,
const_module_ref m,
std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
std::vector<instruction_ref>
@@ -164,6 +164,10 @@ struct module
instruction_ref replace_return(std::vector<instruction_ref> args);
instruction_ref insert_literal(instruction_ref ins, literal l);
instruction_ref insert_parameter(instruction_ref ins, std::string name, shape s);
std::vector<std::string> get_parameter_names() const;
shape get_parameter_shape(std::string name) const;
@@ -227,6 +231,8 @@ struct module
std::unique_ptr<module_impl> impl;
};
inline module& get_module(module& m) { return m; }
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
@@ -32,7 +32,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
using module_ref = module*;
using module_ref = module*;
using const_module_ref = const module*;
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
@@ -198,6 +198,12 @@ void transform(Range&& r, Iterator it, F f)
std::transform(r.begin(), r.end(), it, f);
}
template <class Range1, class Range2, class Iterator, class F>
void transform(Range1&& r1, Range2&& r2, Iterator it, F f)
{
std::transform(r1.begin(), r1.end(), r2.begin(), it, f);
}
template <class Range>
auto reverse(Range& r)
{
......
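Note: the added two-range transform overload mirrors the binary form of std::transform, zipping two ranges through one callable into an output iterator; the caller must guarantee the second range is at least as long as the first. A self-contained usage sketch, with the helper copied locally so the example compiles on its own:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

template <class Range1, class Range2, class Iterator, class F>
void transform(Range1&& r1, Range2&& r2, Iterator it, F f)
{
    std::transform(r1.begin(), r1.end(), r2.begin(), it, f);
}

int main()
{
    std::vector<double> total_ms   = {1.5, 3.0};
    std::vector<std::size_t> count = {3, 6};
    std::vector<double> per_ins;
    // Divide each total by its count, appending the results to per_ins.
    transform(total_ms, count, std::back_inserter(per_ins), [](double t, std::size_t n) {
        return t / n;
    });
    for(auto t : per_ins)
        std::cout << t << std::endl; // 0.5 then 0.5
}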
@@ -191,6 +191,10 @@ struct shape
std::size_t size(std::size_t n = 1) const { return sizeof(type) * n; }
auto is_integral() const { return std::is_integral<type>{}; }
auto is_signed() const { return std::is_signed<type>{}; }
auto is_unsigned() const { return std::is_unsigned<type>{}; }
template <class U>
type* from(U* buffer, std::size_t n = 0) const
{
......
@@ -399,7 +399,8 @@ module::add_instructions(const std::vector<instruction_ref>& instructions,
}
std::vector<instruction_ref>
module::add_instructions(module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
module::add_instructions(const_module_ref m,
std::unordered_map<instruction_ref, instruction_ref> map_ins)
{
return this->insert_instructions(this->end(), m, std::move(map_ins));
}
@@ -420,8 +421,10 @@ module::insert_instructions(instruction_ref ins,
return insert_generic_instructions(*this, ins, instructions, std::move(map_ins));
}
std::vector<instruction_ref> module::insert_instructions(
instruction_ref ins, module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
std::vector<instruction_ref>
module::insert_instructions(instruction_ref ins,
const_module_ref m,
std::unordered_map<instruction_ref, instruction_ref> map_ins)
{
return insert_generic_instructions(*this, ins, iterator_for(*m), std::move(map_ins));
}
@@ -436,11 +439,7 @@ module::insert_instructions(instruction_ref ins,
return insert_generic_instructions(*this, ins, iterator_for(r), std::move(map_ins));
}
instruction_ref module::add_literal(literal l)
{
impl->emplace_front(std::move(l));
return impl->instructions.begin();
}
instruction_ref module::add_literal(literal l) { return insert_literal(begin(), std::move(l)); }
instruction_ref module::add_outline(const shape& s)
{
@@ -450,10 +449,7 @@ instruction_ref module::add_outline(const shape& s)
instruction_ref module::add_parameter(std::string name, shape s)
{
assert(get_parameter_shape(name) == shape{});
impl->push_front({builtin::param{std::move(name), impl->nparams}, std::move(s), {}});
impl->nparams++;
return impl->instructions.begin();
return insert_parameter(begin(), std::move(name), std::move(s));
}
instruction_ref module::add_return(std::vector<instruction_ref> args)
@@ -466,6 +462,20 @@ instruction_ref module::add_return(std::vector<instruction_ref> args)
return result;
}
instruction_ref module::insert_literal(instruction_ref ins, literal l)
{
impl->emplace(ins, std::move(l));
return std::prev(ins);
}
instruction_ref module::insert_parameter(instruction_ref ins, std::string name, shape s)
{
assert(get_parameter_shape(name) == shape{});
impl->insert(ins, {builtin::param{std::move(name), impl->nparams}, std::move(s), {}});
impl->nparams++;
return std::prev(ins);
}
instruction_ref module::replace_return(std::vector<instruction_ref> args)
{
auto last = std::prev(this->end());
......
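Note: add_literal and add_parameter are now one-line wrappers over the new insert_literal/insert_parameter at begin(). Both helpers insert before ins and return std::prev(ins), the freshly inserted instruction; inserting at end() therefore appends, which is what the deserialization code below switches to so that parameters and literals keep their serialized order instead of being pushed to the front.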
@@ -504,12 +504,14 @@ static void mod_from_val(module_ref mod,
if(name == "@param")
{
output = mod->add_parameter(fields["parameter"].to<std::string>(),
migraphx::from_value<shape>(node.at("shape")));
output = mod->insert_parameter(mod->end(),
fields["parameter"].to<std::string>(),
migraphx::from_value<shape>(node.at("shape")));
}
else if(name == "@literal")
{
output = mod->add_literal(migraphx::from_value<literal>(node.at("literal")));
output =
mod->insert_literal(mod->end(), migraphx::from_value<literal>(node.at("literal")));
}
else
{
@@ -544,11 +546,11 @@ static void mod_from_val(module_ref mod,
}
else if(module_inputs.empty())
{
output = mod->add_instruction(op, inputs);
output = mod->insert_instruction(mod->end(), op, inputs);
}
else
{
output = mod->add_instruction(op, inputs, module_inputs);
output = mod->insert_instruction(mod->end(), op, inputs, module_inputs);
}
}
output->set_normalized(normalized);
@@ -681,11 +683,13 @@ void program::perf_report(std::ostream& os,
double overhead_percent = overhead_time * 100.0 / total_time;
double total_instruction_time = 0.0;
std::unordered_map<std::string, double> op_times;
std::unordered_map<std::string, std::size_t> op_n;
for(auto&& p : ins_vec)
{
double avg = common_average(p.second);
op_times[perf_group(p.first->get_operator())] += avg;
total_instruction_time += avg;
op_n[perf_group(p.first->get_operator())]++;
}
double calculate_overhead_time = total_time - total_instruction_time;
double calculate_overhead_percent = calculate_overhead_time * 100.0 / total_time;
@@ -706,18 +710,19 @@ void program::perf_report(std::ostream& os,
os << std::endl;
os << "Summary:" << std::endl;
std::vector<std::pair<double, std::string>> op_times_sorted;
std::transform(op_times.begin(),
op_times.end(),
std::back_inserter(op_times_sorted),
[](auto p) { return std::make_pair(p.second, p.first); });
std::vector<std::tuple<double, std::size_t, std::string>> op_times_sorted;
std::transform(
op_times.begin(), op_times.end(), std::back_inserter(op_times_sorted), [&](auto p) {
auto&& name = p.first;
return std::make_tuple(p.second, op_n.at(name), name);
});
std::sort(op_times_sorted.begin(), op_times_sorted.end(), std::greater<>{});
for(auto&& p : op_times_sorted)
for(auto&& [avg, nn, name] : op_times_sorted)
{
auto&& name = p.second;
double avg = p.first;
double percent = std::ceil(100.0 * avg / total_instruction_time);
os << name << ": " << avg << "ms, " << percent << "%" << std::endl;
double per_ins = avg / nn;
os << name << ": " << avg << "ms / " << nn << " = " << per_ins << "ms, " << percent << "%"
<< std::endl;
}
os << std::endl;
......
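Note: with the new per-group instruction counts, each summary line reports the group's total time, how many instructions it covers, and the per-instruction average, e.g. (illustrative numbers only): gpu::gemm: 12.4ms / 31 = 0.4ms, 46%.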
@@ -36,7 +36,7 @@ void raw_data_to_value(value& v, const RawData& rd)
result["shape"] = migraphx::to_value(rd.get_shape());
if(rd.get_shape().type() == shape::tuple_type)
result["sub"] = migraphx::to_value(rd.get_sub_objects());
else
else if(not rd.empty())
result["data"] = migraphx::value::binary(rd.data(), rd.get_shape().bytes());
v = result;
}
@@ -56,7 +56,7 @@ void migraphx_from_value(const value& v, argument& a)
literal l = migraphx::from_value<literal>(v);
a = l.get_argument();
}
else
else if(v.contains("sub"))
{
a = migraphx::from_value<std::vector<argument>>(v.at("sub"));
}
......
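Note: the two tightened else-if guards make empty values round-trip: serialization only emits a "data" field for a non-empty argument, and deserialization only reads "sub" when it is present, leaving the argument default-constructed otherwise.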
@@ -25,6 +25,7 @@
#include <migraphx/module.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
@@ -52,6 +53,7 @@ struct cpu_literal
return os;
}
};
MIGRAPHX_REGISTER_OP(cpu_literal);
void write_literals::apply(module& m) const
{
......
@@ -164,6 +164,7 @@ add_library(migraphx_gpu
deconvolution.cpp
device_name.cpp
elu.cpp
fuse_mlir.cpp
fuse_ops.cpp
gather.cpp
gemm_impl.cpp
@@ -176,7 +177,7 @@ add_library(migraphx_gpu
loop.cpp
lrn.cpp
leaky_relu.cpp
mlir_conv.cpp
mlir.cpp
multinomial.cpp
nonzero.cpp
pack_args.cpp
@@ -320,16 +321,26 @@ message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")
set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
if(MIGRAPHX_ENABLE_MLIR)
find_library(LIBMLIRMIOPEN MLIRMIOpenThin REQUIRED)
find_library(MLIRAPI_LIBRARY MLIRMIOpen
PATH_SUFFIXES
# Workaround broken MLIR install
lib/ lib/lib)
# REQUIRED is not supported before CMake 3.18
if(NOT LIBMLIRMIOPEN)
message(FATAL_ERROR "libMLIRMIOpenThin not found")
if(NOT MLIRAPI_LIBRARY)
message(FATAL_ERROR "libMLIRMIOpen not found")
else()
message(STATUS "Build with libMLIRMIOpenThin: " ${LIBMLIRMIOPEN})
message(STATUS "Build with libMLIRMIOpen: " ${MLIRAPI_LIBRARY})
endif()
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR_MIOPEN_SUPPORT")
target_link_libraries(migraphx_gpu PUBLIC ${LIBMLIRMIOPEN})
find_path(MLIRAPI_HEADERS NAMES mlir-c/Dialect/MIGraphX.h)
# Work around broken MLIR installation
find_path(MLIRAPI_HEADERS2 NAMES mlir-c/Registration.h
PATH_SUFFIXES
include/external/include external/include)
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
target_include_directories(migraphx_gpu SYSTEM PRIVATE ${MLIRAPI_HEADERS} ${MLIRAPI_HEADERS2})
target_link_libraries(migraphx_gpu PUBLIC ${MLIRAPI_LIBRARY})
endif()
set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
......
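Note: the MLIR build hookup now links the full MLIRMIOpen C-API library instead of MLIRMIOpenThin, searches extra lib/ and include/external suffixes to cope with a broken MLIR install layout, and renames the compile definition from MIGRAPHX_MLIR_MIOPEN_SUPPORT to MIGRAPHX_MLIR, which gates the new fuse_mlir.cpp later in this diff.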
@@ -51,8 +51,9 @@ code_object_op::compute(context& ctx, const shape&, const std::vector<argument>&
std::vector<void*> kargs(args.size());
std::transform(
args.begin(), args.end(), kargs.begin(), [](const argument& a) { return a.data(); });
k.launch(ctx.get_stream().get(), global, local, std::move(kargs));
return args.back();
auto [start, stop] = ctx.get_perf_events();
k.launch(ctx.get_stream().get(), global, local, std::move(kargs), start, stop);
return args[get_output_arg(args.size())];
}
void code_object_op::finalize(context&, const shape&, const std::vector<shape>&)
{
......
@@ -59,31 +59,30 @@ argument miopen_deconvolution::compute(context& ctx,
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
float alpha = 1;
float beta = 0;
auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
algo,
&beta,
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
if(solution_id == 0)
MIGRAPHX_THROW("MIOpen Deconvolution: invalid solution ID");
auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
w_desc.get(),
args[1].implicit(),
x_desc.get(),
args[0].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("Running deconvolution failed");
MIGRAPHX_THROW("MIOpen Deconvolution: running convolution failed");
return args[3];
}
shape miopen_deconvolution::compile(context& ctx,
const shape& output_shape,
std::vector<shape> inputs)
shape miopen_deconvolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
@@ -119,9 +118,35 @@ shape miopen_deconvolution::compile(context& ctx,
workspace_size,
false);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("Find deconvolution failed");
handle = ctx.get_stream().get_miopen();
algo = perf.fwd_algo;
MIGRAPHX_THROW("MIOpen Deconvolution: find convolution failed");
algo = perf.fwd_algo;
size_t solution_count;
status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&solution_count);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: get solution count failed");
std::vector<miopenConvSolution_t> solutions(solution_count);
status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_count,
&solution_count,
solutions.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: get solution failed");
solution_id = solutions.front().solution_id;
return shape{shape::int8_type, {perf.memory}};
}
@@ -129,13 +154,29 @@ void miopen_deconvolution::finalize(context& ctx,
const shape& output_shape,
std::vector<shape> inputs)
{
if(handle == ctx.get_stream().get_miopen())
return;
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = compile(ctx, output_shape, std::move(inputs));
if(ws.bytes() > size)
MIGRAPHX_THROW("Workspace has changed during finalization.");
if(cd == nullptr)
cd = make_deconv(op);
if(solution_id == 0)
{
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = find(ctx, output_shape, inputs);
if(ws.bytes() > size)
MIGRAPHX_THROW("MIOpen Deconvolution: workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: compile solution failed");
}
} // namespace gpu
......
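Note: deconvolution switches from MIOpen's find API (cached algo plus a handle check) to the immediate-mode API: find still runs the find step to size the workspace, then enumerates solutions with miopenConvolutionForwardGetSolutionCount/miopenConvolutionForwardGetSolution and records solutions.front().solution_id; finalize compiles that solution via miopenConvolutionForwardCompileSolution; and compute runs it with miopenConvolutionForwardImmediate. A solution_id of 0 marks the not-yet-found state and is rejected at compute time.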
@@ -38,8 +38,11 @@ struct compile_op : action<compile_op>
context ctx;
auto inputs = p.parse_shapes(v.at("inputs"));
auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << host_time << "ms";
if(device_time > 0)
std::cout << ", " << device_time << "ms";
std::cout << std::endl;
}
};
......
@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n = 100);
std::pair<double, double>
time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
} // namespace driver
} // namespace gpu
......
@@ -42,22 +42,31 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
}
using milliseconds = std::chrono::duration<double, std::milli>;
double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n)
std::pair<double, double>
time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
{
// TODO: Use std::ref
migraphx::context gctx = ctx;
auto output = op.compute_shape(inputs);
op.finalize(gctx, output, inputs);
migraphx::context ctx = ictx;
auto& gctx = any_cast<migraphx::gpu::context>(ctx);
auto output = op.compute_shape(inputs);
op.finalize(ctx, output, inputs);
auto args = generate_arguments(inputs);
auto run = [&] {
op.compute(gctx, output, args);
gctx.finish();
op.compute(ctx, output, args);
ctx.finish();
};
gctx.enable_perf_measurement();
run();
auto r = range(n);
double t = std::accumulate(
r.begin(), r.end(), double{0.0}, [&](auto x, auto) { return x + time<milliseconds>(run); });
return t / n;
double host_time = 0.0;
double device_time = 0.0;
for(auto i : range(n))
{
(void)i;
host_time += time<milliseconds>(run);
device_time += gctx.get_elapsed_ms();
}
return std::make_pair(host_time / n, device_time / n);
}
} // namespace driver
......
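Note: time_op now returns a (host, device) pair of averages: host time wall-clocks compute plus ctx.finish() per iteration, while device time comes from the perf events enabled with gctx.enable_perf_measurement() and read back through gctx.get_elapsed_ms(). The single warm-up run() before the loop keeps one-time finalize/compile costs out of both averages.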
@@ -43,8 +43,8 @@ struct run_op : action<run_op>
auto op = make_op(name);
if(v.contains("fields"))
op.from_value(v.at("fields"));
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << host_time << "ms" << std::endl;
}
};
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/fuse_mlir.hpp>
#include <migraphx/gpu/mlir.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
#ifdef MIGRAPHX_MLIR
struct mlir_conv
{
operation op = make_op("convolution");
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::mlir_conv"; }
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
check_shapes{inputs, *this}.standard();
if(mods.size() != 1)
MIGRAPHX_THROW("should have one submodule.");
if(inputs.size() < 2)
MIGRAPHX_THROW("should have at least two inputs.");
auto n = inputs.size();
return op.compute_shape({inputs[n - 2], inputs[n - 1]});
}
};
MIGRAPHX_REGISTER_OP(mlir_conv);
namespace {
struct find_conv_pointwise
{
// Find a convolution followed by a pointwise operation.
auto matcher() const
{
auto convolution =
match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
auto conv_ins = r.instructions["convolution"];
auto x_ins = r.instructions["x"]; // input after contiguous
auto* pm = ins->module_inputs().front();
auto names = pm->get_parameter_names();
// Whitelist pointwise operators
if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
return not contains({"@literal", "@param", "@return", "convolution", "add", "relu"},
i.name());
}))
return;
// Only fuse with fp32 for now
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
return i->get_shape().type() != shape::type_t::float_type;
}))
return;
std::sort(names.begin(), names.end());
module_ref mm = mpm.create_module("mlir_" + pm->name());
mm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> param_map;
auto x = mm->add_parameter("x" + std::to_string(names.size()),
conv_ins->inputs().at(0)->get_shape());
auto w = mm->add_parameter("x" + std::to_string(names.size() + 1),
conv_ins->inputs().at(1)->get_shape());
auto conv = mm->add_instruction(conv_ins->get_operator(), {x, w});
std::transform(names.begin(),
names.end(),
ins->inputs().begin(),
std::inserter(param_map, param_map.end()),
[&](auto name, auto input) {
if(input == x_ins)
return std::make_pair(pm->get_parameter(name), conv);
return std::make_pair(pm->get_parameter(name),
mm->add_parameter(name, input->get_shape()));
});
mm->add_return(mm->insert_instructions(mm->end(), pm, param_map));
std::vector<instruction_ref> inputs;
std::copy_if(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(inputs),
[&](auto input) { return input != conv_ins; });
inputs.insert(inputs.end(), conv_ins->inputs().begin(), conv_ins->inputs().end());
mpm.get_module().replace_instruction(
ins, mlir_conv{conv_ins->get_operator()}, inputs, {mm});
}
};
} // namespace
#endif
void fuse_mlir::apply(module_pass_manager& mpm) const
{
#ifdef MIGRAPHX_MLIR
match::find_matches(mpm, find_conv_pointwise{});
#else
(void)mpm;
#endif
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
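Note on the fusion above: the matcher finds a pointwise instruction with a convolution input (skipping an intervening contiguous), restricted for now to fp32 and a whitelist of convolution/add/relu. It builds a fresh mlir_* submodule that re-creates the convolution and then inlines the pointwise body, remapping the pointwise parameter that carried the convolution result onto the in-module conv instruction. The convolution's own inputs are appended after the remaining pointwise inputs, which is why the two extra parameters are named x<N> and x<N+1> following the N sorted pointwise parameter names.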
@@ -342,6 +342,7 @@ void move_standard_front(std::vector<instruction_ref>& args)
auto gpu_name(const std::string& s) { return match::name("gpu::" + s); }
namespace {
struct find_layernorm
{
auto matcher() const { return match::layernorm(&gpu_name); }
@@ -843,15 +844,6 @@ inline auto precompile_name(Strings... names) // NOLINT
});
}
template <class... Ms>
auto conv_bias_pointwise(Ms... ms)
{
return precompile_name("pointwise")(
match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
fusable_conv(match::used_once()).bind("conv")),
ms...);
}
struct find_conv_bias
{
context* ctx = nullptr;
@@ -1087,6 +1079,7 @@ struct find_commutative_broadcast
m.replace_instruction(ins, ins->get_operator(), args);
}
};
} // namespace
struct find_contiguous
{
......
@@ -38,12 +38,13 @@ struct context;
struct code_object_op
{
value::binary code_object;
std::string symbol_name;
std::size_t global;
std::size_t local;
std::vector<shape> expected_inputs;
shape output;
value::binary code_object{};
std::string symbol_name = "";
std::size_t global = 0;
std::size_t local = 0;
std::vector<shape> expected_inputs{};
shape output{};
std::int64_t output_arg = -1;
kernel k{};
template <class Self, class F>
@@ -66,9 +67,13 @@ struct code_object_op
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
void finalize(context&, const shape&, const std::vector<shape>&);
std::int64_t get_output_arg(std::size_t n) const
{
return output_arg < 0 ? n + output_arg : output_arg;
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
return get_output_arg(shapes.size());
}
friend std::ostream& operator<<(std::ostream& os, const code_object_op& op)
......
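Note: output_arg defaults to -1 and is resolved Python-style by get_output_arg, indexing from the end of the argument list when negative; the default preserves the old behavior of aliasing the last argument (with four arguments, get_output_arg(4) yields 3), while a non-negative value lets a generated kernel return some argument other than the last, as compute in code_object_op.cpp earlier in this diff now does.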