Merge remote-tracking branch 'origin/optimize' into ck-gsg

e7f7ea10 · Alan Turner · 9c6ba1ed · ae13eb93 · e7f7ea10 · e7f7ea10
Commit e7f7ea10 authored Nov 30, 2022 by Alan Turner
20 changed files
--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -827,7 +827,7 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
    };
    auto dots  = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("dot"));
    auto convs = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("convolution"));
-    return not(dots < 2 and convs < 2);
+    return (dots >= 2 or convs >= 2);
 }

 struct find_conv_dot_horiz_fusion
@@ -933,6 +933,73 @@ struct find_div_const
    }
 };

+struct find_unit_ops // Rewrites x*1, 1*x, x/1, x+0, 0+x and x-0 to the operand bound as "x".
+{
+    auto matcher() const // has_value is assumed to match a constant operand with that value - confirm
+    {
+        auto mul_1 = match::name("mul")( // mul with 1.0 as either argument
+            match::either_arg(0, 1)(match::has_value(1.0f), match::any().bind("x")));
+        auto div_1 = // div whose second argument is 1.0
+            match::name("div")(match::args(match::any().bind("x"), match::has_value(1.0f)));
+        auto add_0 = match::name("add")( // add with 0.0 as either argument; 1e-12 is presumably a tolerance - confirm
+            match::either_arg(0, 1)(match::has_value(0.0f, 1e-12), match::any().bind("x")));
+        auto sub_0 = // sub whose second argument is 0.0
+            match::name("sub")(match::args(match::any().bind("x"), match::has_value(0.0f)));
+        return match::any_of(mul_1, div_1, add_0, sub_0);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins  = r.result; // the matched mul/div/add/sub instruction
+        auto c_in = r.instructions["x"]; // the operand bound as "x" by the matcher
+
+        m.replace_instruction(ins, c_in); // the arithmetic is a no-op, so use the operand directly
+    }
+};
+
+struct find_neg_unit_ops // Rewrites x*-1, -1*x, x/-1 and 0-x to neg(x).
+{
+    auto matcher() const // has_value is assumed to match a constant operand with that value - confirm
+    {
+        auto mul_neg_1 = match::name("mul")( // mul with -1.0 as either argument
+            match::either_arg(0, 1)(match::has_value(-1.0f), match::any().bind("x")));
+        auto div_neg_1 = // div whose second argument is -1.0
+            match::name("div")(match::args(match::any().bind("x"), match::has_value(-1.0f)));
+        auto sub_0 = // sub whose first argument is 0.0, i.e. 0 - x
+            match::name("sub")(match::args(match::has_value(0.0f), match::any().bind("x")));
+        return match::any_of(mul_neg_1, div_neg_1, sub_0);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins  = r.result; // the matched mul/div/sub instruction
+        auto c_in = r.instructions["x"]; // the operand bound as "x" by the matcher
+
+        auto neg = m.insert_instruction(ins, make_op("neg"), c_in); // insert before ins so neg precedes the users rewired to it
+        m.replace_instruction(ins, neg);
+    }
+};
+
+struct find_zero_ops // Rewrites 0*x, x*0 and 0/x to the zero operand itself.
+{
+    auto matcher() const // has_value is assumed to match a constant operand with that value - confirm
+    {
+        auto mul_zero = match::name("mul")( // mul with 0.0 as either argument; the zero side is bound as "x"
+            match::either_arg(0, 1)(match::has_value(0.0f).bind("x"), match::any()));
+        auto div_zero = // div whose first argument is 0.0; NOTE(review): folds 0/x for any x, including x == 0 - confirm
+            match::name("div")(match::args(match::has_value(0.0f).bind("x"), match::any()));
+        return match::any_of(mul_zero, div_zero);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins      = r.result; // the matched mul/div instruction
+        auto zero_ins = r.instructions["x"]; // here "x" names the zero operand, not the other input
+
+        m.replace_instruction(ins, zero_ins); // the result is taken to be zero, so reuse the zero operand
+    }
+};
+
 struct find_sub_const
 {
    auto matcher() const
@@ -1149,6 +1216,9 @@ void simplify_algebra::apply(module& m) const
                            find_mul_conv{},
                            find_mul_slice_conv{},
                            find_mul_add{},
+                            find_unit_ops{},
+                            find_neg_unit_ops{},
+                            find_zero_ops{},
                            find_dot_add{},
                            find_div_const{},
                            find_sub_const{},

--- a/src/targets/cpu/binary.cpp
+++ b/src/targets/cpu/binary.cpp
@@ -51,7 +51,18 @@ struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
        auto r  = s0;
        if(s0 != s1 or not s0.packed())
        {
-            r = shape{s0.type(), s0.lens()};
+            if(s0.packed() != s1.packed())
+            {
+                r = s0.packed() ? s0 : s1;
+            }
+            else if(s0.broadcasted() != s1.broadcasted())
+            {
+                r = s0.broadcasted() ? s1.with_lens(s0.lens()) : s0.with_lens(s0.lens());
+            }
+            else
+            {
+                r = {s0.type(), s0.lens()};
+            }
        }
        // Call to get_primitive to make sure an algo is available
        this->get_primitive(this->to_memory_desc(r, inputs));

--- a/src/targets/cpu/convolution.cpp
+++ b/src/targets/cpu/convolution.cpp
@@ -43,9 +43,9 @@ struct dnnl_convolution
        return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
    }

-    shape adjust_shape(const shape& x, int i) const
+    shape adjust_shape(const shape& x, int i, const shape& output) const
    {
-        auto s = base_adjust_shape(x);
+        auto s = base_adjust_shape(x, output);
        if(i == 1 and op.group > 1)
        {
            // TODO: Add support for transposed weights

--- a/src/targets/cpu/deconvolution.cpp
+++ b/src/targets/cpu/deconvolution.cpp
@@ -37,9 +37,9 @@ struct dnnl_deconvolution
        return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
    }

-    shape adjust_shape(const shape& x, int i) const
+    shape adjust_shape(const shape& x, int i, const shape& output) const
    {
-        auto s = base_adjust_shape(x);
+        auto s = base_adjust_shape(x, output);
        if(i == 1)
        {
            // The input and output channels are flipped for dnnl

--- a/src/targets/cpu/include/migraphx/cpu/dnnl.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/dnnl.hpp
@@ -167,7 +167,7 @@ struct dnnl_op : auto_register_op<Derived>
        std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_SRC_0));
        return result;
    }
-    shape base_adjust_shape(const shape& s) const
+    shape base_adjust_shape(const shape& s, const shape& output) const
    {
        if(s.broadcasted())
        {
@@ -183,7 +183,8 @@ struct dnnl_op : auto_register_op<Derived>
                               else
                                   return len;
                           });
-            return shape{s.type(), lens};
+            // Use the permutation of the output
+            return output.with_lens(s.type(), lens);
        }
        return s;
    }
@@ -204,7 +205,10 @@ struct dnnl_op : auto_register_op<Derived>
            i++;
        }
    }
-    shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
+    shape adjust_shape(const shape& s, int, const shape& output) const // default hook: the argument index (unnamed int) is ignored
+    {
+        return base_adjust_shape(s, output); // forwards the output shape so base_adjust_shape can use it
+    }
    std::vector<int> create_arg_map(std::size_t input_size) const
    {
        const auto& self     = static_cast<const Derived&>(*this);
@@ -224,12 +228,12 @@ struct dnnl_op : auto_register_op<Derived>
        const auto& self = static_cast<const Derived&>(*this);
        std::unordered_map<int, dnnl::memory::desc> result;
        result[MIGRAPHX_DNNL_PREFIX(ARG_DST)] =
-            to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size()));
+            to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size(), output_shape));
        auto m = create_arg_map(inputs.size());
        assert(m.size() >= inputs.size());
        for(int i = 0; i < inputs.size(); i++)
        {
-            result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i));
+            result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i, output_shape));
        }
        return result;
    }

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -26,7 +26,6 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/identity.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
@@ -216,55 +215,6 @@ struct cpu_pad
 };
 MIGRAPHX_REGISTER_OP(cpu_pad)

-struct leaky_relu_op
-{
-    op::leaky_relu op;
-    std::string name() const { return "cpu::leaky_relu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : x * a; };
-    }
-};
-
-template <typename Op>
-struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
-{
-    cpu_unary2() = default;
-
-    template <class T>
-    cpu_unary2(T pop) : op(Op{std::move(pop)})
-    {
-    }
-
-    Op op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op.op, f);
-    }
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1);
-        const auto& s = inputs.at(0);
-        return {s.type(), s.lens()};
-    }
-
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            assert(input.get_shape().standard());
-            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-        });
-
-        return result;
-    }
-};
-template struct cpu_unary2<leaky_relu_op>;
-
 struct cpu_rnn_var_sl_last_output
 {
    op::rnn_var_sl_last_output op;

--- a/src/targets/cpu/reorder.cpp
+++ b/src/targets/cpu/reorder.cpp
@@ -32,7 +32,7 @@ struct dnnl_reorder : dnnl_op<dnnl_reorder, dnnl::reorder>
 {
    std::string name() const { return "dnnl::reorder"; }

-    shape adjust_shape(const shape& x, int) const { return x; }
+    shape adjust_shape(const shape& x, int, const shape&) const { return x; } // reorder returns x unchanged; index and output shape are ignored

    shape compute_shape(const std::vector<shape>& inputs) const
    {

--- a/src/targets/cpu/target.cpp
+++ b/src/targets/cpu/target.cpp
@@ -33,16 +33,15 @@
 #include <migraphx/eliminate_data_type.hpp>
 #include <migraphx/eliminate_identity.hpp>
 #include <migraphx/eliminate_pad.hpp>
+#include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/replace_allocate.hpp>
-#include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
 #include <migraphx/rewrite_rnn.hpp>
 #include <migraphx/schedule.hpp>
-#include <migraphx/memory_coloring.hpp>
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
@@ -78,14 +77,15 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
            eliminate_identity{},
            eliminate_pad{},
            dead_code_elimination{},
-            rewrite_batchnorm{},
-            dead_code_elimination{},
            rewrite_rnn{},
            dead_code_elimination{},
            eliminate_common_subexpression{},
            dead_code_elimination{},
            simplify_algebra{},
            simplify_reshapes{},
+            layout_nhwc{},
+            dead_code_elimination{},
+            simplify_reshapes{},
            simplify_algebra{},
            auto_contiguous{},
            simplify_reshapes{},

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -78,15 +78,13 @@ add_library(migraphx_gpu
    allocation_model.cpp
    argmax.cpp
    argmin.cpp
-    batch_norm_inference.cpp
    code_object_op.cpp
    compile_ops.cpp
    compile_gen.cpp
    compile_hip.cpp
    compile_hip_code_object.cpp
+    compile_miopen.cpp
    compiler.cpp
-    convolution.cpp
-    deconvolution.cpp
    device_name.cpp
    elu.cpp
    fuse_ck.cpp
@@ -103,7 +101,6 @@ add_library(migraphx_gpu
    logsoftmax.cpp
    loop.cpp
    lrn.cpp
-    leaky_relu.cpp
    mlir.cpp
    multinomial.cpp
    nonzero.cpp
@@ -113,7 +110,6 @@ add_library(migraphx_gpu
    pad.cpp
    perfdb.cpp
    pooling.cpp
-    quant_convolution.cpp
    reverse.cpp
    rnn_variable_seq_lens.cpp
    rocblas.cpp
@@ -148,16 +144,10 @@ register_migraphx_gpu_ops(hip_
 )
 register_migraphx_gpu_ops(miopen_
    abs
-    batch_norm_inference
    contiguous
-    convolution
-    deconvolution
-    elu
    int8_conv_pack
-    leaky_relu
    lrn
    pooling
-    quant_convolution
 )
 register_op(migraphx_gpu 
    HEADER migraphx/gpu/rnn_variable_seq_lens.hpp 
@@ -171,6 +161,9 @@ register_op(migraphx_gpu
    HEADER migraphx/gpu/gemm.hpp 
    OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
    INCLUDES migraphx/gpu/context.hpp)
+register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp 
+    OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution>
+    INCLUDES migraphx/gpu/context.hpp)
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)

@@ -245,14 +238,14 @@ endif()
 include(CheckLibraryExists)
 get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
-check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+# check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)

-if(HAS_FIND_2_API) 
-    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
-    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
-else()
-    message(STATUS "MIOpen does not have Find-2.0 API")
-endif()
+# if(HAS_FIND_2_API) 
+#     target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+#     message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
+# else()
+#     message(STATUS "MIOpen does not have Find-2.0 API")
+# endif()

 if(HAS_FIND_MODE_API)
    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -144,9 +144,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)

 std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
 {
-    const std::size_t min_block_size  = 64;
-    const std::size_t base_block_size = 32;
-    auto block_size                   = (((n - 1) / base_block_size + 1)) * base_block_size;
+    const std::size_t min_block_size = 64; // lower bound for the result and, with this change, also the rounding step
+    auto block_size                  = (((n - 1) / min_block_size + 1)) * min_block_size; // n rounded up to a multiple of 64; n == 0 wraps to 0 and is lifted by the max() below
     return std::min(std::max(min_block_size, block_size), max_block_size);
 }


--- a/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
@@ -21,47 +21,81 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_DECONVOLUTION_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DECONVOLUTION_HPP
-
-#include <migraphx/shape.hpp>
-#include <migraphx/op/deconvolution.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/compile_miopen.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/register_op.hpp>
+#include <migraphx/op/identity.hpp>
+#include <migraphx/gpu/rocblas.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-struct context;
-
-struct miopen_deconvolution
+struct miopen_op // Wrapper op; compile_miopen::apply compiles the wrapped operation and substitutes it.
 {
-    op::deconvolution op;
-    shared<convolution_descriptor> cd;
-    miopenConvFwdAlgorithm_t algo{};
-    uint64_t solution_id = 0;
+    operation op = op::identity{}; // the wrapped operation; identity until one is assigned

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack_join(op::deconvolution::reflect(self.op, f),
-                         pack(f(self.solution_id, "solution_id")));
+        return pack(f(self.op, "op")); // the wrapped operation is the only reflected field
+    }
+
+    std::string name() const { return "gpu::miopen_op"; } // compile_miopen::apply selects instructions by this name
+
+    shape compute_shape(std::vector<shape> inputs) const // NOTE(review): inputs.back() requires at least one input
+    {
+        inputs.push_back(inputs.back()); // duplicates the last shape; presumably a stand-in for the workspace argument added at compile time - confirm
+        return op.compute_shape(inputs); // delegate to the wrapped operation with the extended list
    }

-    std::string name() const { return "gpu::deconv"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
 };
+MIGRAPHX_REGISTER_OP(miopen_op);
+
+std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const // returns the "workspace" value reported by op.compile
+{
+    op.from_value({{"int8_x4_format", format}}); // mutates the caller's op: sets its int8_x4_format field
+    auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs())); // compile against the instruction's output and input shapes
+    return v.get<std::size_t>("workspace", 0); // 0 is the fallback passed to get when "workspace" is absent
+}
+
+void compile_miopen::apply(module& m) const // replaces every gpu::miopen_op with its compiled wrapped op plus a workspace allocation
+{
+    assert(ctx);
+    const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx)); // format tried first, read from the GPU context
+    for(auto ins : iterator_for(m))
+    {
+        if(ins->name() != "gpu::miopen_op") // only the wrapper instructions are handled here
+            continue;
+        auto op        = any_cast<miopen_op>(ins->get_operator()).op; // copy of the wrapped operation; compile() modifies it
+        std::size_t ws = 0; // workspace size reported by compile()
+        try
+        {
+            // For regular convolution and deconvolution this first attempt is expected to always succeed.
+            ws = compile(op, ins, int8_x4_format);
+        }
+        catch(migraphx::exception&)
+        {
+            // In case no solver supports the default format, retry using the other format.
+            ws = compile(op, ins, not int8_x4_format);
+        }
+        auto inputs = ins->inputs();
+        auto alloc  = m.insert_instruction( // int8 buffer with ws elements, placed before ins
+            ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
+        inputs.insert(std::prev(inputs.end()), alloc); // the workspace becomes the second-to-last argument
+
+        m.replace_instruction(ins, op, inputs); // swap the wrapper for the compiled operation
+    }
+}

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -39,19 +39,26 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL);

 struct precompile_op
 {
-    operation op = op::identity{};
+    operation op                = op::identity{};
+    std::size_t additional_args = 1;
+    bool ignore_modules         = false;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack(f(self.op, "op"));
+        return pack(f(self.op, "op"),
+                    f(self.additional_args, "additional_args"),
+                    f(self.ignore_modules, "ignore_modules"));
    }

    std::string name() const { return "gpu::precompile_op"; }

    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
    {
-        inputs.pop_back();
+        // Pop off additional args
+        inputs.resize(inputs.size() - additional_args); // NOTE(review): the subtraction wraps if additional_args > inputs.size()
+        if(ignore_modules) // when set, the wrapped op's shape is computed without the module arguments
+            return op.compute_shape(inputs);
        return op.compute_shape(inputs, mods);
    }


--- a/src/targets/gpu/convolution.cpp
+++ b/src/targets/gpu/convolution.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/convolution.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/generate.hpp>
-#include <miopen/miopen.h>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape miopen_convolution::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(4).standard();
-    std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
-    check_shapes{conv_inputs, *this}.max_ndims(5);
-    return op.normalize_compute_shape(conv_inputs);
-}
-
-inline shape reshape_if_1d(const shape& input)
-{
-    shape new_shape{input};
-    auto dims = new_shape.lens();
-
-    if(dims.size() == 3)
-    {
-        std::vector<size_t> new_dims = dims;
-        new_dims.insert(new_dims.begin() + 2, 1);
-        new_shape = shape{input.type(), new_dims};
-    }
-    return new_shape;
-}
-
-argument miopen_convolution::compute(context& ctx,
-                                     const shape& output_shape,
-                                     const std::vector<argument>& args) const
-{
-    auto x_desc                = make_tensor(reshape_if_1d(args[0].get_shape()));
-    auto w_desc                = make_tensor(reshape_if_1d(args[1].get_shape()));
-    auto y_desc                = make_tensor(reshape_if_1d(output_shape));
-    auto* miopen_stream_handle = ctx.get_stream().get_miopen();
-    auto workspace_size        = args[2].get_shape().bytes();
-
-#ifdef MIGRAPHX_HAS_FIND_2_API
-    {
-        const miopenTensorArgument_t tensor_args[3] = {
-            {miopenTensorConvolutionX, nullptr, args[0].implicit()},
-            {miopenTensorConvolutionW, nullptr, args[1].implicit()},
-            {miopenTensorConvolutionY, nullptr, args[3].implicit()},
-        };
-
-        if(solution_ptr.get() == nullptr)
-            MIGRAPHX_THROW("MIOpen Convolution : Load MIOpen Solution before running it");
-
-        auto status = miopenRunSolution(miopen_stream_handle,
-                                        solution_ptr.get(),
-                                        3,
-                                        tensor_args,
-                                        args[2].implicit(),
-                                        workspace_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen Convolution: running convolution using find_2.0 failed");
-
-        return args[3];
-    }
-#else
-    // else use immediate mode
-    if(solution_id == 0)
-        MIGRAPHX_THROW("MIOpen Convolution: invalid solution ID");
-
-    auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
-                                                    w_desc.get(),
-                                                    args[1].implicit(),
-                                                    x_desc.get(),
-                                                    args[0].implicit(),
-                                                    cd.get(),
-                                                    y_desc.get(),
-                                                    args[3].implicit(),
-                                                    args[2].implicit(),
-                                                    workspace_size,
-                                                    solution_id);
-
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Convolution: running convolution failed");
-    return args[3];
-#endif
-}
-
-shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
-{
-    shape workspace_shape{};
-    auto x_desc                = make_tensor(reshape_if_1d(inputs[0]));
-    auto w_desc                = make_tensor(reshape_if_1d(inputs[1]));
-    auto y_desc                = make_tensor(reshape_if_1d(output_shape));
-    std::size_t workspace_size = 0;
-
-#ifdef MIGRAPHX_HAS_FIND_2_API
-    {
-        auto conv_problem = make_obj<miopen_problem>(
-            &miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
-
-        set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
-        set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
-        set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
-
-        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
-
-        solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
-
-        auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen Convolution : failed to get solution's workspace size");
-
-        std::size_t solution_size;
-        status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen Convolution: Failed to fetch solution size");
-
-        auto solution_binary = std::vector<char>{};
-        solution_binary.resize(solution_size);
-
-        status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen Convolution: Saving solution failed");
-        solution_object = value::binary{solution_binary.data(), solution_size};
-
-        return shape{shape::int8_type, {workspace_size}};
-    }
-#else
-    // else use immediate find mode
-    auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                                           w_desc.get(),
-                                                           x_desc.get(),
-                                                           cd.get(),
-                                                           y_desc.get(),
-                                                           &workspace_size);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Convolution: Failed to get forward workspace size");
-
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
-
-    auto x         = to_gpu(generate_argument(inputs[0]));
-    auto w         = to_gpu(generate_argument(inputs[1]));
-    auto y         = allocate_gpu(output_shape);
-    auto workspace = allocate_gpu(workspace_shape);
-
-    int algo_count = 1;
-    miopenConvAlgoPerf_t perf;
-    status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                                   x_desc.get(),
-                                                   x.implicit(),
-                                                   w_desc.get(),
-                                                   w.implicit(),
-                                                   cd.get(),
-                                                   y_desc.get(),
-                                                   y.implicit(),
-                                                   1,
-                                                   &algo_count,
-                                                   &perf,
-                                                   workspace.implicit(),
-                                                   workspace_size,
-                                                   false);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
-    algo = perf.fwd_algo;
-
-    size_t solution_count;
-
-    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
-                                                      w_desc.get(),
-                                                      x_desc.get(),
-                                                      cd.get(),
-                                                      y_desc.get(),
-                                                      &solution_count);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Convolution: get solution count failed");
-
-    std::vector<miopenConvSolution_t> solutions(solution_count);
-
-    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
-                                                 w_desc.get(),
-                                                 x_desc.get(),
-                                                 cd.get(),
-                                                 y_desc.get(),
-                                                 solution_count,
-                                                 &solution_count,
-                                                 solutions.data());
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Convolution: get solution failed");
-
-    solution_id = solutions.front().solution_id;
-
-    return shape{shape::int8_type, {perf.memory}};
-#endif
-}
-
-void miopen_convolution::finalize(context& ctx,
-                                  const shape& output_shape,
-                                  const std::vector<shape>& inputs)
-{
-#ifdef MIGRAPHX_HAS_FIND_2_API
-    {
-        (void)(ctx); // avoid warnings
-        (void)(output_shape);
-        (void)(inputs);
-        // load solution
-        if(solution_ptr == nullptr)
-        {
-            miopenSolution_t ptr;
-            auto status  = miopenLoadSolution(&ptr,
-                                             reinterpret_cast<const char*>(solution_object.data()),
-                                             solution_object.size());
-            solution_ptr = miopen_solution{ptr};
-            if(status != miopenStatusSuccess)
-                MIGRAPHX_THROW("MIOpen Convolution: loading convolution solution failed");
-        }
-    }
-#else
-    // Use immediate mode API
-    {
-        if(cd == nullptr)
-            cd = make_conv(op);
-        if(solution_id == 0)
-        {
-            // Check that workspace hasn't changed
-            auto size = inputs.at(2).bytes();
-            auto ws   = find(ctx, output_shape, inputs);
-            if(ws.bytes() > size)
-                MIGRAPHX_THROW("MIOpen Convolution: workspace has changed during finalization.");
-        }
-
-        auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
-        auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
-        auto y_desc = make_tensor(reshape_if_1d(output_shape));
-
-        auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
-                                                              w_desc.get(),
-                                                              x_desc.get(),
-                                                              cd.get(),
-                                                              y_desc.get(),
-                                                              solution_id);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
-    }
-#endif
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/deconvolution.cpp
+++ b/src/targets/gpu/deconvolution.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/deconvolution.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/generate.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape miopen_deconvolution::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(4).standard();
-    std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
-    check_shapes{conv_inputs, *this}.max_ndims(5);
-    return op.compute_shape(conv_inputs);
-}
-
-inline shape reshape_if_1d(const shape& input)
-{
-    shape new_shape{input};
-    auto dims = new_shape.lens();
-
-    if(dims.size() == 3)
-    {
-        std::vector<size_t> new_dims = dims;
-        new_dims.insert(new_dims.begin() + 2, 1);
-        new_shape = shape{input.type(), new_dims};
-    }
-    return new_shape;
-}
-
-argument miopen_deconvolution::compute(context& ctx,
-                                       const shape& output_shape,
-                                       const std::vector<argument>& args) const
-{
-    auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
-    auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
-    auto y_desc = make_tensor(reshape_if_1d(output_shape));
-
-    if(solution_id == 0)
-        MIGRAPHX_THROW("MIOpen Deconvolution: invalid solution ID");
-
-    auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
-                                                    w_desc.get(),
-                                                    args[1].implicit(),
-                                                    x_desc.get(),
-                                                    args[0].implicit(),
-                                                    cd.get(),
-                                                    y_desc.get(),
-                                                    args[3].implicit(),
-                                                    args[2].implicit(),
-                                                    args[2].get_shape().bytes(),
-                                                    solution_id);
-
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Deconvolution: running convolution failed");
-    return args[3];
-}
-
-shape miopen_deconvolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
-{
-    shape workspace_shape{};
-
-    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
-    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
-    auto y_desc = make_tensor(reshape_if_1d(output_shape));
-
-    std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                             w_desc.get(),
-                                             x_desc.get(),
-                                             cd.get(),
-                                             y_desc.get(),
-                                             &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
-
-    auto x         = to_gpu(generate_argument(inputs[0]));
-    auto w         = to_gpu(generate_argument(inputs[1]));
-    auto y         = allocate_gpu(output_shape);
-    auto workspace = allocate_gpu(workspace_shape);
-
-    int algo_count = 1;
-    miopenConvAlgoPerf_t perf;
-    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                                        x_desc.get(),
-                                                        x.implicit(),
-                                                        w_desc.get(),
-                                                        w.implicit(),
-                                                        cd.get(),
-                                                        y_desc.get(),
-                                                        y.implicit(),
-                                                        1,
-                                                        &algo_count,
-                                                        &perf,
-                                                        workspace.implicit(),
-                                                        workspace_size,
-                                                        false);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Deconvolution: find convolution failed");
-    algo = perf.fwd_algo;
-
-    size_t solution_count;
-
-    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
-                                                      w_desc.get(),
-                                                      x_desc.get(),
-                                                      cd.get(),
-                                                      y_desc.get(),
-                                                      &solution_count);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Deconvolution: get solution count failed");
-
-    std::vector<miopenConvSolution_t> solutions(solution_count);
-
-    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
-                                                 w_desc.get(),
-                                                 x_desc.get(),
-                                                 cd.get(),
-                                                 y_desc.get(),
-                                                 solution_count,
-                                                 &solution_count,
-                                                 solutions.data());
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Deconvolution: get solution failed");
-
-    solution_id = solutions.front().solution_id;
-
-    return shape{shape::int8_type, {perf.memory}};
-}
-
-void miopen_deconvolution::finalize(context& ctx,
-                                    const shape& output_shape,
-                                    std::vector<shape> inputs)
-{
-    if(cd == nullptr)
-        cd = make_deconv(op);
-    if(solution_id == 0)
-    {
-        // Check that workspace hasn't changed
-        auto size = inputs.at(2).bytes();
-        auto ws   = find(ctx, output_shape, inputs);
-        if(ws.bytes() > size)
-            MIGRAPHX_THROW("MIOpen Deconvolution: workspace has changed during finalization.");
-    }
-
-    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
-    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
-    auto y_desc = make_tensor(reshape_if_1d(output_shape));
-
-    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
-                                                          w_desc.get(),
-                                                          x_desc.get(),
-                                                          cd.get(),
-                                                          y_desc.get(),
-                                                          solution_id);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Deconvolution: compile solution failed");
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/elu.cpp
+++ b/src/targets/gpu/elu.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/elu.hpp>
-#include <migraphx/gpu/context.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
-}
-
-argument miopen_elu::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
-
-    return args[1];
-}
-
-void miopen_elu::finalize(context&, const shape&, const std::vector<shape>&)
-{
-    ad = make_elu(op.alpha);
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
@@ -49,7 +49,7 @@ struct mlir_conv
    std::string name() const { return "gpu::mlir_conv"; }
    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
    {
-        check_shapes{inputs, *this}.standard();
+        check_shapes{inputs, *this}.packed_or_broadcasted();
        if(mods.size() != 1)
            MIGRAPHX_THROW("should have one submodule.");
        if(inputs.size() < 2)
@@ -70,6 +70,9 @@ MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
    auto group = v.at("group").to<int>();
    if(group != 1)
        return false;
+    // Avoid MLIR assertion: Index < Length && "Invalid index!"
+    if(ins->get_shape().lens().size() != 4)
+        return false;
    return true;
 }

@@ -96,9 +99,10 @@ struct find_conv_pointwise
                                   i.name());
           }))
            return;
-        // Only fuse with fp32 for now
+        // Only fuse with fp32/fp16
        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
-               return i->get_shape().type() != shape::type_t::float_type;
+               return not contains({shape::type_t::float_type, shape::type_t::half_type},
+                                   i->get_shape().type());
           }))
            return;
        std::sort(names.begin(), names.end());

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -26,7 +26,6 @@
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/oper.hpp>
 #include <migraphx/gpu/gemm.hpp>
@@ -190,10 +189,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
        return false;
    auto wei = ins->inputs().at(1)->get_shape();
    assert(wei.lens().size() == 4);
-    auto conv = any_cast<miopen_convolution>(ins->get_operator());
-    if(conv.op.group > 1)
+    auto miopen_conv_op = ins->get_operator().to_value();
+    auto algo           = miopen_conv_op.at("algo").to<miopenConvFwdAlgorithm_t>();
+    auto conv_op        = from_value<op::convolution>(miopen_conv_op["op"]);
+    if(conv_op.group > 1)
        return false;
-    if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
+    if(wei.lens()[1] > 512 and algo != miopenConvolutionFwdAlgoWinograd)
        return false;

    // Do not fuse non-symmetric input
@@ -201,13 +202,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
    if(input_lens[2] != input_lens[3] or wei.lens()[2] != wei.lens()[3])
        return false;

-    auto op = conv.op;
    // Dont fuse winograd for non-3x3s since there is no fused windograd for those configs
-    if(conv.algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and
-       wei.lens()[3] != 3 and contains({{1, 1}}, op.stride))
+    if(algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and wei.lens()[3] != 3 and
+       contains({{1, 1}}, conv_op.stride))
        return false;
-    return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, op.padding) and
-           contains({{0, 0}, {1, 1}}, op.stride) and contains({{1, 1}}, op.dilation);
+    return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, conv_op.padding) and
+           contains({{0, 0}, {1, 1}}, conv_op.stride) and contains({{1, 1}}, conv_op.dilation);
 }

 void move_broadcasted_back(std::vector<instruction_ref>& args)
@@ -462,7 +462,7 @@ void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
    auto ins         = r.result;
    auto input_ins   = conv_ins->inputs().at(0);
    auto weights_ins = conv_ins->inputs().at(1);
-    auto conv_op     = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
+    auto conv_op     = from_value<op::convolution>((conv_ins->get_operator()).to_value()["op"]);
    auto alloc_ins   = ins->inputs().back();
    auto old_ws_ins  = conv_ins->inputs().at(2);

@@ -528,7 +528,7 @@ struct find_conv_pointwise
        auto ins         = r.result;
        auto input_ins   = conv_ins->inputs().at(0);
        auto weights_ins = conv_ins->inputs().at(1);
-        auto conv_op     = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
+        auto conv_op     = from_value<op::convolution>(conv_ins->get_operator().to_value()["op"]);
        auto alloc_ins   = ins->inputs().back();

        module_ref pm = ins->module_inputs().front();
@@ -553,11 +553,13 @@ struct find_gemm_pointwise
 {
    auto matcher() const
    {
-        return precompile_name("pointwise")(
+        auto gemm_op   = match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm"); // shared sub-matcher for the gemm, bound as "gemm"
+        auto binary_op = match::all_of( // 3-input form: the gemm plus a "c" operand, in either order among inputs 0 and 1
            match::nargs(3),
            match::either_arg(0, 1)(
-                match::any_of(match::standard_shape(), match::is_constant()).bind("c"),
-                match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm")));
+                match::any_of(match::standard_shape(), match::is_constant()).bind("c"), gemm_op));
+        auto unary_op = match::all_of(match::nargs(2), match::arg(0)(gemm_op)); // 2-input form: the gemm is input 0, no "c" is bound
+        return precompile_name("pointwise")(match::any_of(binary_op, unary_op)); // accept either form
    }

    // TODO: Move to matcher.hpp
@@ -589,61 +591,84 @@ struct find_gemm_pointwise
        return match::name("@return")(match::args(match::any_of(add, mul_add, add_mul)));
    }

+    static auto match_mul(const std::string& input) // builds the matcher update_gemm uses for single-parameter modules
+    {
+        auto mul = match_mul_const(match_param(input), "alpha"); // update_gemm reads the "alpha" binding after this matches; match_mul_const itself is not visible here
+        return match::name("@return")(match::args(mul)); // match an "@return" instruction whose argument is that mul
+    }
+
    static float get_float(instruction_ref ins) { return ins->get_literal().at<float>(); }

    template <class Gemm>
    static bool update_gemm(Gemm& gemm, module_ref pm, unsigned input)
    {
        auto names = pm->get_parameter_names();
-        if(names.size() != 2)
-            return false;
        std::sort(names.begin(), names.end());
-        unsigned output = input == 0 ? 1 : 0;
-        auto mr         = match::match_instruction(
-            *pm, std::prev(pm->end()), match_add(names[input], names[output]));
-        if(mr.result == pm->end())
-            return false;
-        if(contains(mr.instructions, "alpha_mul"))
+        if(names.size() == 1) // single-parameter module: only the match_mul pattern is accepted
+        {
+            auto mr = match::match_instruction(*pm, std::prev(pm->end()), match_mul(names[input])); // NOTE(review): names has one entry here, so input must be 0 — confirm every caller guarantees it
+            if(mr.result == pm->end())
+                return false;
            gemm.alpha *= get_float(mr.instructions["alpha"]);
-        else if(contains(mr.instructions, "beta_mul"))
-            gemm.beta *= get_float(mr.instructions["beta"]);
-        else if(contains(mr.instructions, "gamma_mul"))
+            return true;
+        }
+        else if(names.size() == 2) // two-parameter module: matched against match_add
        {
-            gemm.alpha *= get_float(mr.instructions["gamma"]);
-            gemm.beta *= get_float(mr.instructions["gamma"]);
+            unsigned output = input == 0 ? 1 : 0; // index of the other parameter
+            auto mr         = match::match_instruction(
+                *pm, std::prev(pm->end()), match_add(names[input], names[output]));
+            if(mr.result == pm->end())
+                return false;
+            if(contains(mr.instructions, "alpha_mul"))
+                gemm.alpha *= get_float(mr.instructions["alpha"]);
+            else if(contains(mr.instructions, "beta_mul"))
+                gemm.beta *= get_float(mr.instructions["beta"]);
+            else if(contains(mr.instructions, "gamma_mul"))
+            {
+                gemm.alpha *= get_float(mr.instructions["gamma"]); // gamma scales both alpha and beta
+                gemm.beta *= get_float(mr.instructions["gamma"]);
+            }
+            return true; // also reached when none of the three *_mul bindings is present
+        }
+        else // any other parameter count is rejected
+        {
+            return false;
        }
-        return true;
    }

    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins      = r.result;
        auto gemm_ins = r.instructions["gemm"];
-        auto c_ins    = r.instructions["c"];

        auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());

        // Already fused gemm
        if(not float_equal(gemm.beta, 0))
            return;
-        gemm.beta = 1;
+        if(ins->inputs().size() == 3)
+            gemm.beta = 1;

        if(not update_gemm(
               gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 0 : 1))
            return;

-        // const-fold input if not standard shape since rocblas can't handle it
-        if(not c_ins->get_shape().standard())
-        {
-            auto c = make_op("contiguous");
-            auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
-            c_ins  = m.add_literal(l.get_shape(), l.data());
-        }
-
        auto inputs = gemm_ins->inputs();
        inputs.pop_back();

-        inputs.push_back(c_ins);
+        if(ins->inputs().size() == 3)
+        {
+            auto c_ins = r.instructions["c"];
+            // const-fold input if not standard shape since rocblas can't handle it
+            if(not c_ins->get_shape().standard())
+            {
+                auto c = make_op("contiguous");
+                auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
+                c_ins  = m.add_literal(l.get_shape(), l.data());
+            }
+            inputs.push_back(c_ins);
+        }
+
        inputs.push_back(ins->inputs().back());

        m.replace_instruction(ins, gemm, inputs);
@@ -772,11 +797,9 @@ struct find_layernorm_pointwise
    {
        auto ins       = r.result;
        auto layernorm = r.instructions["layernorm"];
-        auto* pm       = ins->module_inputs().front();
-
        if(not layernorm->module_inputs().empty())
            return;
-
+        auto* pm    = ins->module_inputs().front();
        auto inputs = layernorm->inputs();
        inputs.pop_back();
        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
@@ -785,6 +808,37 @@ struct find_layernorm_pointwise
    }
 };

+struct find_concat_pointwise // rewrite rule: replaces a matched pointwise instruction with its input concat's operator, keeping the pointwise module
+{
+    auto matcher() const
+    {
+        return precompile_name("pointwise")( // the result is the instruction matched by precompile_name("pointwise") ...
+            match::arg(0)(precompile_name("concat").bind("concat"))); // ... whose input 0 matches precompile_name("concat"), bound as "concat"
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins    = r.result;
+        auto concat = r.instructions["concat"];
+        if(not concat->module_inputs().empty()) // skip a concat that already has module inputs
+            return;
+
+        // TODO: Handle type conversions
+        if(ins->get_shape().type() != concat->get_shape().type()) // bail out when the two output element types differ
+            return;
+
+        auto* pm    = ins->module_inputs().front();
+        auto inputs = concat->inputs();
+        inputs.pop_back(); // drop concat's last input — presumably its output allocation, TODO confirm
+        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end()); // append every input of ins except input 0 (the concat)
+
+        auto op = concat->get_operator();
+        op.from_value({{"additional_args", ins->inputs().size() - 1}, {"ignore_modules", true}}); // additional_args = number of inputs appended above
+
+        m.replace_instruction(ins, op, inputs, {pm}); // ins now runs the concat operator with pm as its module input
+    }
+};
+
 void fuse_ops::apply(module& m) const
 {
    match::find_matches(m, find_contiguous_pointwise{});
@@ -793,6 +847,7 @@ void fuse_ops::apply(module& m) const
    run_passes(m, {dead_code_elimination{}});
    match::find_matches(m,
                        find_layernorm_pointwise{},
+                        find_concat_pointwise{},
                        find_gemm_pointwise{},
                        find_contiguous_tranpose_gemm{},
                        find_commutative_broadcast{});

--- a/src/targets/gpu/gemm_impl.cpp
+++ b/src/targets/gpu/gemm_impl.cpp
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/permutation.hpp>

--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -183,8 +183,8 @@ argument register_on_gpu(const argument& arg)
 {
    auto arg_shared = arg.share();
    auto p          = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
-    return {arg_shared.get_shape(),
-            [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
+    auto s          = arg_shared.get_shape();
+    return {s, [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
 }

 argument to_gpu(const argument& arg, bool host)

--- a/src/targets/gpu/include/migraphx/gpu/batch_norm_inference.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/batch_norm_inference.hpp
@@ -21,41 +21,31 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP
-#define MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP
+#ifndef MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
+#define MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP

-#include <migraphx/argument.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
-#include <migraphx/reflect.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/instruction_ref.hpp>
+#include <string>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {

+struct module;
 struct context;
+struct operation;

-struct miopen_batch_norm_inference
-{
-    op::batch_norm_inference op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
+namespace gpu {

-    std::string name() const { return "gpu::batch_norm_inference"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
+struct compile_miopen // declares name/apply(module&); looks like a module pass — confirm against its users
+{
+    context* ctx = nullptr; // null unless the creator sets it
+    std::string name() const { return "gpu::compile_miopen"; }
+    void apply(module& m) const; // declared only; the definition is not in this header
+    std::size_t compile(operation& op, instruction_ref ins, bool format) const; // declared only; meaning of the returned size and of `format` is not visible here
 };

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
+#endif // MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP