manual merge

870a396b · Khalique Ahmed · 228b665c · d309e02f · 228b665c · 228b665c
Commit 870a396b authored Jan 23, 2023 by Khalique Ahmed
20 changed files
--- a/src/targets/gpu/device/where.cpp
+++ b/src/targets/gpu/device/where.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/device/where.hpp>
-#include <migraphx/gpu/device/tensor.hpp>
-#include <migraphx/gpu/device/types.hpp>
-#include <migraphx/gpu/device/launch.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-
-template <class Shape>
-constexpr auto get_rank(const Shape&)
-{
-    return decltype(typename Shape::hip_index{}.size()){};
-}
-
-void where(hipStream_t stream,
-           const argument& result,
-           const argument& arg0,
-           const argument& arg1,
-           const argument& arg2)
-{
-    hip_visit_all(result, arg1, arg2)([&](auto output, auto x, auto y) {
-        hip_visit_all(arg0)([&](auto cond) {
-            if constexpr(get_rank(cond.get_shape()) == get_rank(output.get_shape()))
-            {
-                gs_launch(stream, arg1.get_shape().elements())([=](auto idx) __device__ {
-                    auto i    = output.get_shape().multi(idx);
-                    output[i] = cond[i] ? x[i] : y[i];
-                });
-            }
-        });
-    });
-}
-
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/elu.cpp
+++ b/src/targets/gpu/elu.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/elu.hpp>
-#include <migraphx/gpu/context.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
-}
-
-argument miopen_elu::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
-
-    return args[1];
-}
-
-void miopen_elu::finalize(context&, const shape&, const std::vector<shape>&)
-{
-    ad = make_elu(op.alpha);
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
@@ -49,7 +49,7 @@ struct mlir_conv
    std::string name() const { return "gpu::mlir_conv"; }
    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
    {
-        check_shapes{inputs, *this}.standard();
+        check_shapes{inputs, *this}.packed_or_broadcasted();
        if(mods.size() != 1)
            MIGRAPHX_THROW("should have one submodule.");
        if(inputs.size() < 2)
@@ -61,13 +61,28 @@ struct mlir_conv
 MIGRAPHX_REGISTER_OP(mlir_conv);

 namespace {
+
+MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins) // Predicate: true only for a "convolution" with group == 1 and a 4-dimensional shape.
+{
+    if(ins->name() != "convolution") // anything that is not a convolution is rejected up front
+        return false;
+    value v    = ins->get_operator().to_value(); // serialize the operator so its attributes can be read by key
+    auto group = v.at("group").to<int>(); // the "group" attribute, read as an int
+    if(group != 1) // any group count other than 1 is rejected
+        return false;
+    // Avoid MLIR assertion: Index < Length && "Invalid index!"
+    if(ins->get_shape().lens().size() != 4) // the instruction's shape must have exactly 4 dimensions
+        return false;
+    return true; // all checks passed
+}
+
 struct find_conv_pointwise
 {
    // Find a convolution followed by a pointwise operation.
    auto matcher() const
    {
        auto convolution =
-            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
+            match::skip(match::name("contiguous"))(is_mlir_conv().bind("convolution"));
        return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
    }

@@ -84,9 +99,10 @@ struct find_conv_pointwise
                                   i.name());
           }))
            return;
-        // Only fuse with fp32 for now
+        // Only fuse with fp32/fp16
        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
-               return i->get_shape().type() != shape::type_t::float_type;
+               return not contains({shape::type_t::float_type, shape::type_t::half_type},
+                                   i->get_shape().type());
           }))
            return;
        std::sort(names.begin(), names.end());

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -26,24 +26,9 @@
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/add.hpp>
-#include <migraphx/gpu/mul.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/device/layernorm.hpp>
-#include <migraphx/gpu/device/gelu.hpp>
-#include <migraphx/gpu/device/mul_add.hpp>
-#include <migraphx/gpu/device/add_clip.hpp>
-#include <migraphx/gpu/device/add_relu.hpp>
-#include <migraphx/gpu/device/add_sigmoid.hpp>
-#include <migraphx/gpu/device/add_tanh.hpp>
-#include <migraphx/gpu/device/mul_add_relu.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/match/layernorm.hpp>
-#include <migraphx/match/gelu_erf.hpp>
-#include <migraphx/match/gelu_tanh.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/array.hpp>
@@ -204,10 +189,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
        return false;
    auto wei = ins->inputs().at(1)->get_shape();
    assert(wei.lens().size() == 4);
-    auto conv = any_cast<miopen_convolution>(ins->get_operator());
-    if(conv.op.group > 1)
+    auto miopen_conv_op = ins->get_operator().to_value();
+    auto algo           = miopen_conv_op.at("algo").to<miopenConvFwdAlgorithm_t>();
+    auto conv_op        = from_value<op::convolution>(miopen_conv_op["op"]);
+    if(conv_op.group > 1)
        return false;
-    if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
+    if(wei.lens()[1] > 512 and algo != miopenConvolutionFwdAlgoWinograd)
        return false;

    // Do not fuse non-symmetric input
@@ -215,109 +202,14 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
    if(input_lens[2] != input_lens[3] or wei.lens()[2] != wei.lens()[3])
        return false;

-    auto op = conv.op;
    // Dont fuse winograd for non-3x3s since there is no fused windograd for those configs
-    if(conv.algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and
-       wei.lens()[3] != 3 and contains({{1, 1}}, op.stride))
+    if(algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and wei.lens()[3] != 3 and
+       contains({{1, 1}}, conv_op.stride))
        return false;
-    return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, op.padding) and
-           contains({{0, 0}, {1, 1}}, op.stride) and contains({{1, 1}}, op.dilation);
+    return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, conv_op.padding) and
+           contains({{0, 0}, {1, 1}}, conv_op.stride) and contains({{1, 1}}, conv_op.dilation);
 }

-struct hip_triadd : ternary_device<hip_triadd, &device::add>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd)
-
-struct hip_triadd_clip : quinary_device<hip_triadd_clip, &device::add_clip>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_clip)
-
-struct hip_add_clip : quaternary_device<hip_add_clip, &device::add_clip>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_clip)
-
-struct hip_triadd_relu : ternary_device<hip_triadd_relu, &device::add_relu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_relu)
-
-struct hip_triadd_sigmoid : ternary_device<hip_triadd_sigmoid, &device::add_sigmoid>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_sigmoid)
-
-struct hip_triadd_tanh : ternary_device<hip_triadd_tanh, &device::add_tanh>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_tanh)
-
-struct hip_add_relu : binary_device<hip_add_relu, &device::add_relu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_relu)
-
-struct hip_add_sigmoid : binary_device<hip_add_sigmoid, &device::add_sigmoid>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_sigmoid)
-
-struct hip_add_tanh : binary_device<hip_add_tanh, &device::add_tanh>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_tanh)
-
-struct hip_layernorm : unary_device<hip_layernorm, &device::layernorm>
-{
-    // Empty finalize to skip dimension reduction
-    void finalize(context&, const shape&, const std::vector<shape>&) {}
-};
-MIGRAPHX_REGISTER_OP(hip_layernorm)
-
-struct hip_triadd_layernorm : ternary_device<hip_triadd_layernorm, &device::triadd_layernorm>
-{
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(4).standard();
-        return inputs[0];
-    }
-    // Empty finalize to skip dimension reduction
-    void finalize(context&, const shape&, const std::vector<shape>&) {}
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_layernorm)
-
-struct hip_gelu : unary_device<hip_gelu, &device::gelu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_gelu)
-
-struct hip_add_gelu : binary_device<hip_add_gelu, &device::add_gelu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_gelu)
-
-struct hip_gelu_new : unary_device<hip_gelu_new, &device::gelu_new>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_gelu_new)
-
-struct hip_add_gelu_new : binary_device<hip_add_gelu_new, &device::add_gelu_new>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_gelu_new)
-
-struct hip_mul_add : ternary_device<hip_mul_add, &device::mul_add>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_mul_add)
-
-struct hip_mul_add_relu : ternary_device<hip_mul_add_relu, &device::mul_add_relu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_mul_add_relu)
-
 void move_broadcasted_back(std::vector<instruction_ref>& args)
 {
    // Ensure the last arguments is the broadcasted one
@@ -341,256 +233,6 @@ void move_standard_front(std::vector<instruction_ref>& args)
 auto gpu_name(const std::string& s) { return match::name("gpu::" + s); }

 namespace {
-struct find_layernorm
-{
-    auto matcher() const { return match::layernorm(&gpu_name); }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto args  = ins->inputs();
-
-        // We dont fuse for non-standard layouts
-        if(not x_ins->get_shape().standard())
-            return;
-
-        auto relements = x_ins->get_shape().lens().back();
-
-        if(relements > 1024 or (relements % 4 != 0 and relements > 256))
-            return;
-
-        m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
-    }
-};
-
-struct find_triadd_layernorm
-{
-    auto matcher() const
-    {
-        return match::name("gpu::layernorm")(match::arg(0)(match::name("gpu::triadd")(
-            match::used_once(), match::all_of[match::inputs()](match::standard_shape()))));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins    = r.result;
-        auto triadd = ins->inputs().front();
-        m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
-    }
-};
-
-struct find_gelu
-{
-    auto matcher() const { return match::gelu_erf(&gpu_name); }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto args  = ins->inputs();
-
-        m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
-    }
-};
-
-struct find_add_gelu
-{
-    auto matcher() const
-    {
-        return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins = r.instructions["add"];
-        auto ins     = r.result;
-        auto args    = add_ins->inputs();
-        move_standard_front(args);
-        move_broadcasted_back(args);
-
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_add_gelu{}, args);
-    }
-};
-
-struct find_gelu_new
-{
-    bool fast_math = true;
-
-    auto matcher() const { return match::gelu_tanh(&gpu_name); }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto args  = ins->inputs();
-
-        if(fast_math)
-            m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
-        else
-            m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
-    }
-};
-
-struct find_add_gelu_new
-{
-    auto matcher() const
-    {
-        return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins = r.instructions["add"];
-        auto ins     = r.result;
-        auto args    = add_ins->inputs();
-        move_standard_front(args);
-        move_broadcasted_back(args);
-
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_add_gelu_new{}, args);
-    }
-};
-
-struct find_add_clip
-{
-    auto matcher() const
-    {
-        return match::name(std::unordered_set<std::string>{"gpu::clip", "gpu::clipped_relu"})(
-            match::arg(0)(match::any_of(match::name("gpu::add"),
-                                        match::name("gpu::triadd"),
-                                        match::any_of[match::inputs()](match::standard_shape()))
-                              .bind("add")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins  = r.instructions["add"];
-        auto ins      = r.result;
-        auto ins_args = ins->inputs();
-        auto add_args = add_ins->inputs();
-        move_standard_front(add_args);
-        move_broadcasted_back(add_args);
-
-        // Use the allocation from the clip operator
-        add_args.pop_back();
-        add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
-        if(add_ins->name() == "gpu::add")
-            m.replace_instruction(ins, hip_add_clip{}, add_args);
-        else if(add_ins->name() == "gpu::triadd")
-            m.replace_instruction(ins, hip_triadd_clip{}, add_args);
-    }
-};
-
-struct find_add_unary
-{
-    std::string op_name;
-    operation binary_add_op;
-    operation ternary_add_op;
-    auto matcher() const
-    {
-        return match::name(op_name)(match::arg(0)(
-            match::used_once(),
-            match::any_of(match::name("gpu::add"),
-                          match::name("gpu::triadd"),
-                          match::any_of(match::name("@literal"),
-                                        match::any_of[match::inputs()](match::standard_shape())))
-                .bind("add")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins = r.instructions["add"];
-        auto ins     = r.result;
-        auto args    = add_ins->inputs();
-        move_standard_front(args);
-        move_broadcasted_back(args);
-
-        // Use the allocation from the relu operator
-        args.back() = ins->inputs().back();
-        if(add_ins->name() == "gpu::add")
-            m.replace_instruction(ins, binary_add_op, args);
-        else if(add_ins->name() == "gpu::triadd")
-            m.replace_instruction(ins, ternary_add_op, args);
-    }
-};
-
-struct find_triadd
-{
-    auto matcher() const
-    {
-        return match::name("gpu::add")(match::either_arg(0, 1)(
-            match::name("gpu::add")(match::used_once()).bind("add"),
-            match::any(match::any_of(match::name("@literal"),
-                                     match::any_of[match::inputs()](match::standard_shape())))
-                .bind("input")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins   = r.instructions["add"];
-        auto input_ins = r.instructions["input"];
-        auto ins       = r.result;
-        auto args      = add_ins->inputs();
-
-        auto is_broadcasted = [](auto arg) { return arg->get_shape().broadcasted(); };
-        if(std::count_if(args.begin(), args.end(), is_broadcasted) > 2)
-            return;
-        args.insert(args.begin(), input_ins);
-        move_standard_front(args);
-        move_broadcasted_back(args);
-
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_triadd{}, args);
-    }
-};
-
-struct find_mul_add
-{
-    auto matcher() const
-    {
-        return match::name("gpu::add")(match::either_arg(0, 1)(
-            match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto mul_ins = r.instructions["mul"];
-        auto b_ins   = r.instructions["b"];
-        auto ins     = r.result;
-        auto args    = mul_ins->inputs();
-        assert(mul_ins != b_ins);
-
-        move_standard_front(args);
-        move_broadcasted_back(args);
-        args.insert(std::prev(args.end()), b_ins);
-
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_mul_add{}, args);
-    }
-};
-
-struct find_mul_add_relu
-{
-    auto matcher() const
-    {
-        return match::name("gpu::relu")(
-            match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto mul_add_ins = r.instructions["mul_add"];
-        auto ins         = r.result;
-        auto args        = mul_add_ins->inputs();
-
-        // Use the allocation from the relu operator
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_mul_add_relu{}, args);
-    }
-};
-
 struct miopen_fusion
 {
    struct fuse_op_data
@@ -820,7 +462,7 @@ void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
    auto ins         = r.result;
    auto input_ins   = conv_ins->inputs().at(0);
    auto weights_ins = conv_ins->inputs().at(1);
-    auto conv_op     = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
+    auto conv_op     = from_value<op::convolution>((conv_ins->get_operator()).to_value()["op"]);
    auto alloc_ins   = ins->inputs().back();
    auto old_ws_ins  = conv_ins->inputs().at(2);

@@ -886,7 +528,7 @@ struct find_conv_pointwise
        auto ins         = r.result;
        auto input_ins   = conv_ins->inputs().at(0);
        auto weights_ins = conv_ins->inputs().at(1);
-        auto conv_op     = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
+        auto conv_op     = from_value<op::convolution>(conv_ins->get_operator().to_value()["op"]);
        auto alloc_ins   = ins->inputs().back();

        module_ref pm = ins->module_inputs().front();
@@ -907,46 +549,6 @@ struct find_conv_pointwise
    }
 };

-struct find_gemm_add
-{
-    auto matcher() const
-    {
-        return match::name("gpu::add")(
-            match::all_of[match::inputs()](match::standard_shape()),
-            match::either_arg(0, 1)(match::used_once().bind("c"),
-                                    match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
-    }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins      = r.result;
-        auto gemm_ins = r.instructions["gemm"];
-        auto c_ins    = r.instructions["c"];
-
-        auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
-
-        // Already fused gemm
-        if(not float_equal(gemm.beta, 0))
-            return;
-
-        auto inputs = gemm_ins->inputs();
-        inputs.pop_back();
-
-        auto copy_ins = c_ins;
-
-        // Insert copy
-        if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
-        {
-            copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
-        }
-        inputs.push_back(copy_ins);
-        inputs.push_back(copy_ins);
-
-        gemm.beta = 1;
-        m.replace_instruction(ins, gemm, inputs);
-    }
-};
-
 struct find_gemm_pointwise
 {
    auto matcher() const
@@ -1170,11 +772,9 @@ struct find_layernorm_pointwise
    {
        auto ins       = r.result;
        auto layernorm = r.instructions["layernorm"];
-        auto* pm       = ins->module_inputs().front();
-
        if(not layernorm->module_inputs().empty())
            return;
-
+        auto* pm    = ins->module_inputs().front();
        auto inputs = layernorm->inputs();
        inputs.pop_back();
        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
@@ -1183,29 +783,46 @@ struct find_layernorm_pointwise
    }
 };

+struct find_concat_pointwise // Rewrite rule: replaces a pointwise fed by a concat with the concat's operator carrying the pointwise module.
+{
+    auto matcher() const // Matches a "pointwise" whose argument 0 is a "concat"; the concat is bound under the key "concat".
+    {
+        return precompile_name("pointwise")( // NOTE(review): precompile_name is defined outside this hunk — confirm which instruction names it accepts
+            match::arg(0)(precompile_name("concat").bind("concat")));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const // Performs the replacement in module m for one match r.
+    {
+        auto ins    = r.result; // the matched pointwise instruction
+        auto concat = r.instructions["concat"]; // the instruction bound as "concat" by matcher()
+        if(not concat->module_inputs().empty()) // skip a concat that already has module inputs
+            return;
+
+        // TODO: Handle type conversions
+        if(ins->get_shape().type() != concat->get_shape().type()) // bail out when the two result types differ
+            return;
+
+        auto* pm    = ins->module_inputs().front(); // first module input of the pointwise instruction
+        auto inputs = concat->inputs(); // start from a copy of the concat's inputs
+        inputs.pop_back(); // drop the concat's last input — NOTE(review): presumably its output allocation; confirm
+        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end()); // append every pointwise input except input 0 (the concat itself)
+
+        auto op = concat->get_operator(); // copy of the concat operator, reconfigured on the next line
+        op.from_value({{"additional_args", ins->inputs().size() - 1}, {"ignore_modules", true}}); // additional_args = number of inputs appended above
+
+        m.replace_instruction(ins, op, inputs, {pm}); // the pointwise is replaced by op with the combined inputs and module pm
+    }
+};
+
 void fuse_ops::apply(module& m) const
 {
-    match::find_matches(m, find_contiguous_pointwise{}, find_gelu{}, find_gelu_new{fast_math});
+    match::find_matches(m, find_contiguous_pointwise{});
    run_passes(m, {dead_code_elimination{}});
-    match::find_matches(m, find_triadd{});
-    match::find_matches(m,
-                        find_layernorm{},
-                        find_conv_pointwise{ctx},
-                        find_conv_bias_relu{ctx},
-                        find_conv_bias{ctx},
-                        find_add_gelu{},
-                        find_add_gelu_new{},
-                        find_mul_add{},
-                        find_mul_add_relu{},
-                        find_add_unary{"gpu::relu", hip_add_relu{}, hip_triadd_relu{}},
-                        find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
-                        find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
-                        find_add_clip{});
+    match::find_matches(m, find_conv_pointwise{ctx}, find_conv_bias_relu{ctx}, find_conv_bias{ctx});
    run_passes(m, {dead_code_elimination{}});
    match::find_matches(m,
-                        find_triadd_layernorm{},
-                        find_gemm_add{},
                        find_layernorm_pointwise{},
+                        find_concat_pointwise{},
                        find_gemm_pointwise{},
                        find_contiguous_tranpose_gemm{},
                        find_commutative_broadcast{});

--- a/src/targets/gpu/gemm_impl.cpp
+++ b/src/targets/gpu/gemm_impl.cpp
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/permutation.hpp>

--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -183,8 +183,8 @@ argument register_on_gpu(const argument& arg)
 {
    auto arg_shared = arg.share();
    auto p          = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
-    return {arg_shared.get_shape(),
-            [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
+    auto s          = arg_shared.get_shape();
+    return {s, [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
 }

 argument to_gpu(const argument& arg, bool host)
@@ -196,12 +196,21 @@ argument to_gpu(const argument& arg, bool host)
 argument from_gpu(const argument& arg)
 {
    argument result;
-    arg.visit([&](auto x) {
-        using type = typename decltype(x)::value_type;
-        auto v     = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
-        // cppcheck-suppress returnDanglingLifetime
-        result = {x.get_shape(), [v]() mutable { return v.data(); }};
-    });
+    arg.visit(
+        [&](auto x) {
+            using type = typename decltype(x)::value_type;
+            auto v     = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
+            // cppcheck-suppress returnDanglingLifetime
+            result = {x.get_shape(), [v]() mutable { return v.data(); }};
+        },
+        [&](const auto& xs) {
+            std::vector<argument> args;
+            std::transform(xs.begin(), xs.end(), std::back_inserter(args), [&](auto x) {
+                return from_gpu(x);
+            });
+            result = argument{args};
+        });
+
    return result;
 }


--- a/src/targets/gpu/include/migraphx/gpu/acos.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/acos.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ACOS_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ACOS_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/acos.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_acos : unary_device<hip_acos, device::acos>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/acosh.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/acosh.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ACOSH_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ACOSH_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/acosh.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_acosh : unary_device<hip_acosh, device::acosh>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/asin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/asin.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ASIN_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ASIN_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/asin.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_asin : unary_device<hip_asin, device::asin>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/asinh.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/asinh.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ASINH_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ASINH_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/asinh.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_asinh : unary_device<hip_asinh, device::asinh>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/atan.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/atan.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ATAN_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ATAN_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/atan.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_atan : unary_device<hip_atan, device::atan>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/atanh.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/atanh.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ATANH_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ATANH_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/atanh.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_atanh : unary_device<hip_atanh, device::atanh>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/ceil.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ceil.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CEIL_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CEIL_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/ceil.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_ceil : unary_device<hip_ceil, device::ceil>
-{
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/clip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/clip.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
-
-#include <migraphx/argument.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/op/clip.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
-
-struct hip_clip
-{
-    op::clip op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "gpu::clip"; }
-    shape compute_shape(std::vector<shape> inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/compile_gen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_gen.hpp
@@ -36,6 +36,9 @@ inline namespace MIGRAPHX_INLINE_NS {
 struct shape;

 namespace gpu {
+
+struct context;
+
 namespace gen {

 struct vectorize
@@ -43,6 +46,10 @@ struct vectorize
    std::size_t size = 1;
    std::size_t axis = 0;
    static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
+    static vectorize elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs);
+    static vectorize elements(std::size_t axis,
+                              const std::vector<shape>& inputs,
+                              const std::vector<std::size_t>& sizes);
    std::string str() const;
 };
 struct preload

--- a/src/include/migraphx/rewrite_batchnorm.hpp
+++ b/src/include/migraphx/rewrite_batchnorm.hpp
@@ -21,28 +21,31 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_FWD_CONV_BATCHNORM_REWRITE_HPP
-#define MIGRAPHX_GUARD_RTGLIB_FWD_CONV_BATCHNORM_REWRITE_HPP
+#ifndef MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
+#define MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP

-#include <string>
-#include <migraphx/instruction_ref.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/instruction_ref.hpp>
+#include <string>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

 struct module;
+struct context;
+struct operation;

-/**
- * Rewrite batchnorm to a multiply and add.
- */
-struct rewrite_batchnorm
+namespace gpu {
+
+struct compile_miopen
 {
-    std::string name() const { return "rewrite_batchnorm"; }
+    context* ctx = nullptr;
+    std::string name() const { return "gpu::compile_miopen"; }
    void apply(module& m) const;
+    std::size_t compile(operation& op, instruction_ref ins, bool format) const;
 };

+} // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
+#endif // MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
--- a/src/targets/gpu/include/migraphx/gpu/concat.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/concat.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP
-
-#include <migraphx/argument.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/op/concat.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
-
-struct hip_concat
-{
-    op::concat op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "gpu::concat"; }
-    shape compute_shape(std::vector<shape> inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/concat_gpu_opt.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/concat_gpu_opt.hpp
@@ -24,8 +24,9 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONCAT_GPU_OPT_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONCAT_GPU_OPT_HPP

-#include <migraphx/gpu/concat.hpp>
+#include <migraphx/op/concat.hpp>
 #include <migraphx/operation.hpp>
+#include <migraphx/serialize.hpp>

 namespace migraphx {
 namespace gpu {
@@ -36,7 +37,8 @@ struct concat_gpu_optimization
    std::string allocate() const { return "hip::allocate"; }
    migraphx::op::concat get_concat(const migraphx::operation& op) const
    {
-        return migraphx::any_cast<migraphx::gpu::hip_concat>(op).op;
+        auto v = op.to_value();
+        return from_value<migraphx::op::concat>(v.at("op"));
    }
 };


--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -197,7 +197,9 @@ struct hip_device
 struct context
 {
    context(std::size_t device_id = 0, std::size_t n = value_of(MIGRAPHX_NSTREAMS{}, 1))
-        : current_device(std::make_shared<hip_device>(device_id, n))
+        : current_device(std::make_shared<hip_device>(device_id, n)),
+          begin_event(create_event()),
+          finish_event(create_event())
    {
    }

@@ -274,6 +276,24 @@ struct context
        this->current_device = std::make_shared<hip_device>(0, n_streams);
    }

+    // Records begin_event on the stream held by `queue`, then calls wait() on this
+    // context's current stream with that event. Throws if recording the event fails.
+    void wait_for(any_ptr queue)
+    {
+        hipStream_t external_stream = queue.get<hipStream_t>();
+        const auto record_status    = hipEventRecord(begin_event.get(), external_stream);
+        if(record_status != hipSuccess)
+        {
+            MIGRAPHX_THROW("failed to record " + hip_error(record_status));
+        }
+
+        get_stream().wait(begin_event.get());
+    }
+
+    // Records finish_event on this context's current stream, then makes the stream
+    // held by `queue` wait on that event. Throws if the wait cannot be enqueued.
+    void finish_on(any_ptr queue)
+    {
+        get_stream().record(finish_event.get());
+
+        hipStream_t external_stream = queue.get<hipStream_t>();
+        const auto wait_status = hipStreamWaitEvent(external_stream, finish_event.get(), 0);
+        if(wait_status != hipSuccess)
+        {
+            MIGRAPHX_THROW("Failed to wait on event " + hip_error(wait_status));
+        }
+    }
+
    any_ptr get_queue() { return get_stream().get(); }

    void enable_perf_measurement(bool b = true)
@@ -316,9 +336,13 @@ struct context
    // TODO: Make this a vector to support multiple devices
    std::shared_ptr<hip_device> current_device;
    std::vector<shared<hip_event_ptr>> events;
-    bool measure_perf                 = false;
+    bool measure_perf = false;
+    // for event perf timing
    shared<hip_event_ptr> start_event = nullptr;
    shared<hip_event_ptr> stop_event  = nullptr;
+    // for stream synchronization
+    shared<hip_event_ptr> begin_event  = nullptr;
+    shared<hip_event_ptr> finish_event = nullptr;
 };

 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -25,45 +25,318 @@
 #define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP

 #include <migraphx/shape.hpp>
-#include <migraphx/op/convolution.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/register_op.hpp>
 #include <migraphx/gpu/miopen.hpp>
-
+#include <migraphx/op/identity.hpp>
+#include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/deconvolution.hpp>
+#include <unordered_map>
+#include <migraphx/reflect.hpp>
+#include <migraphx/gpu/context.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-struct context;
+// Returns a copy of `input` unless it has exactly three dimensions. In that case a
+// dimension of size 1 is inserted at index 2 and a new shape is constructed from
+// input.type() and the resulting lengths.
+inline shape reshape_if_1d(const shape& input)
+{
+    const auto& dims = input.lens();
+    if(dims.size() != 3)
+        return input;
+
+    std::vector<size_t> expanded = dims;
+    expanded.insert(expanded.begin() + 2, 1);
+    return shape{input.type(), expanded};
+}

+template <class Op>
 struct miopen_convolution
 {
-    op::convolution op;
+    Op op;
+    bool int8_x4_format               = false;
    shared<convolution_descriptor> cd = nullptr;
    miopenConvFwdAlgorithm_t algo{};
+#ifdef MIGRAPHX_HAS_FIND_2_API
+    value::binary solution_object{};
+    shared<miopen_solution> solution_ptr = nullptr;
+#endif
    uint64_t solution_id = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack(f(self.op.padding, "padding"),
-                    f(self.op.stride, "stride"),
-                    f(self.op.dilation, "dilation"),
-                    f(self.op.group, "group"),
-                    f(self.op.padding_mode, "padding_mode"),
+        return pack(f(self.op, "op"),
+#ifdef MIGRAPHX_HAS_FIND_2_API
+                    f(self.solution_object, "solution_object"),
+#endif
+                    f(self.algo, "algo"),
+                    f(self.int8_x4_format, "int8_x4_format"),
                    f(self.solution_id, "solution_id"));
    }

-    std::string name() const { return "gpu::convolution"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
+    std::string name() const { return "gpu::" + op.name(); }
+
+    // Checks the operator inputs with has(4), then validates only the first two of
+    // them (max_ndims(5) and the listed packed layouts) and returns the shape that
+    // the wrapped operator `op` computes from those two inputs.
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, op}.has(4);
+
+        // Only inputs[0] and inputs[1] take part in the layout checks and in the
+        // output shape computation.
+        std::vector<shape> leading_inputs(inputs.begin(), inputs.begin() + 2);
+        check_shapes{leading_inputs, *this}.max_ndims(5).packed_layouts(
+            {{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}});
+
+        return migraphx::compute_shape<Op>(op, leading_inputs);
+    }
+
    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
+    {
+        // Runs the convolution through MIOpen using the handle of ctx's stream.
+        // args[0] and args[1] are passed as the X and W tensors, args[2] provides the
+        // workspace buffer (its byte size is the workspace size), and args[3] is passed
+        // as the Y tensor and returned as the result.
+        // NOTE(review): x_desc, w_desc and y_desc are only referenced in the #else
+        // (immediate mode) branch below; the find-2.0 branch does not use them.
+        auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
+        auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+        auto workspace_size        = args[2].get_shape().bytes();
+
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            // Bind the device buffers to the X, W and Y tensor roles for miopenRunSolution.
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, args[0].implicit()},
+                {miopenTensorConvolutionW, nullptr, args[1].implicit()},
+                {miopenTensorConvolutionY, nullptr, args[3].implicit()},
+            };
+
+            // A solution must already be present in solution_ptr; compute does not create one.
+            if(solution_ptr.get() == nullptr)
+                MIGRAPHX_THROW("MIOpen " + op.name() + " : Load MIOpen Solution before running it");
+
+            auto status = miopenRunSolution(miopen_stream_handle,
+                                            solution_ptr.get(),
+                                            3,
+                                            tensor_args,
+                                            args[2].implicit(),
+                                            workspace_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() +
+                               " : running convolution using find_2.0 failed");
+
+            return args[3];
+        }
+#else
+        // else use immediate mode
+        // A solution_id of 0 is rejected as invalid before calling into MIOpen.
+        if(solution_id == 0)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : invalid solution ID");
+
+        auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
+                                                        w_desc.get(),
+                                                        args[1].implicit(),
+                                                        x_desc.get(),
+                                                        args[0].implicit(),
+                                                        cd.get(),
+                                                        y_desc.get(),
+                                                        args[3].implicit(),
+                                                        args[2].implicit(),
+                                                        workspace_size,
+                                                        solution_id);
+
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": running convolution failed");
+        return args[3];
+#endif
+    }
+
+    // Builds the convolution descriptor for `op` and stores it in `cd`: operators
+    // named "deconvolution" go through make_deconv, all others through make_conv.
+    void set_conv_descriptor()
+    {
+        if(op.name() == "deconvolution")
+            cd = make_deconv(op);
+        else
+            cd = make_conv(op);
+    }
+
+    // Sets up the convolution descriptor, runs find() against the GPU context held
+    // by `ctx`, and returns a value whose "workspace" entry is the byte size of the
+    // shape that find() produced.
+    value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
+    {
+        set_conv_descriptor();
+        auto& gpu_ctx              = any_cast<migraphx::gpu::context>(ctx);
+        const auto workspace_shape = find(gpu_ctx, output, input);
+        return {{"workspace", workspace_shape.bytes()}};
+    }
+
+    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
+    {
+        shape workspace_shape{};
+        auto x_desc                = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto w_desc                = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+        std::size_t workspace_size = 0;
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            auto conv_problem = make_obj<miopen_problem>(
+                &miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
+
+            set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
+            set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
+
+            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+
+            solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
+            auto status  = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");
+
+            std::size_t solution_size;
+            status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen" + op.name() + ": Failed to fetch solution size");
+
+            auto solution_binary = std::vector<char>{};
+            solution_binary.resize(solution_size);
+
+            status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen" + op.name() + ": Saving solution failed");
+            solution_object = value::binary{solution_binary.data(), solution_size};
+            return shape{shape::int8_type, {workspace_size}};
+        }
+#else
+        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
+
+        workspace_shape = shape{shape::int8_type, {workspace_size}};
+
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
+        auto x         = to_gpu(generate_argument(x_shape));
+        auto w         = to_gpu(generate_argument(w_shape));
+        auto y         = allocate_gpu(output_shape);
+        auto workspace = allocate_gpu(workspace_shape);
+
+        int algo_count = 1;
+        miopenConvAlgoPerf_t perf;
+        status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
+                                                       x_desc.get(),
+                                                       x.implicit(),
+                                                       w_desc.get(),
+                                                       w.implicit(),
+                                                       cd.get(),
+                                                       y_desc.get(),
+                                                       y.implicit(),
+                                                       1,
+                                                       &algo_count,
+                                                       &perf,
+                                                       workspace.implicit(),
+                                                       workspace_size,
+                                                       false);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
+        algo = perf.fwd_algo;
+        size_t solution_count;
+
+        status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
+                                                          w_desc.get(),
+                                                          x_desc.get(),
+                                                          cd.get(),
+                                                          y_desc.get(),
+                                                          &solution_count);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution count failed");
+
+        std::vector<miopenConvSolution_t> solutions(solution_count);
+
+        status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
+                                                     w_desc.get(),
+                                                     x_desc.get(),
+                                                     cd.get(),
+                                                     y_desc.get(),
+                                                     solution_count,
+                                                     &solution_count,
+                                                     solutions.data());
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution failed");
+
+        solution_id = solutions.front().solution_id;
+
+        return shape{shape::int8_type, {perf.memory}};
+#endif
+    }
+
+    /// Prepares the previously selected MIOpen solution so it is ready to run.
+    ///
+    /// With MIGRAPHX_HAS_FIND_2_API: if `solution_ptr` is null, the solution is
+    /// rebuilt from the serialized bytes held in `solution_object`; the three
+    /// parameters are unused in this configuration.
+    ///
+    /// Otherwise (immediate mode API): the convolution descriptor is set up, the
+    /// search in `find` is re-run when `solution_id` is 0, and the solution
+    /// identified by `solution_id` is compiled for the given tensor shapes.
+    ///
+    /// @param ctx          GPU context; supplies the MIOpen handle in immediate mode.
+    /// @param output_shape Shape of the convolution result.
+    /// @param inputs       Input shapes; in immediate mode inputs[0] and inputs[1]
+    ///                     are used for the x/w descriptors and inputs.at(2) is
+    ///                     read as the already-allocated workspace.
+    /// @throws via MIGRAPHX_THROW when loading or compiling the solution fails, or
+    ///         when the re-run `find` asks for more workspace bytes than
+    ///         inputs.at(2) provides.
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
+    {
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            (void)(ctx); // avoid warnings
+            (void)(output_shape);
+            (void)(inputs);
+            // load solution
+            if(solution_ptr == nullptr)
+            {
+                // Start from a known value: the handle must never be read while
+                // indeterminate, even if the load call leaves it untouched.
+                miopenSolution_t ptr = nullptr;
+                auto status =
+                    miopenLoadSolution(&ptr,
+                                       reinterpret_cast<const char*>(solution_object.data()),
+                                       solution_object.size());
+                // Check the status before storing the handle so a failed load
+                // never leaves a bogus handle in `solution_ptr`.
+                if(status != miopenStatusSuccess)
+                    MIGRAPHX_THROW("MIOpen " + op.name() + ": loading convolution solution failed");
+                solution_ptr = miopen_solution{ptr};
+            }
+        }
+#else
+        // Use immediate mode API
+        {
+            set_conv_descriptor();
+            if(solution_id == 0)
+            {
+                // Check that workspace hasn't changed
+                auto size = inputs.at(2).bytes();
+                auto ws   = find(ctx, output_shape, inputs);
+                if(ws.bytes() > size)
+                    MIGRAPHX_THROW("MIOpen " + op.name() +
+                                   ": workspace has changed during finalization.");
+            }
+
+            auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+            auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+            auto y_desc = make_tensor(reshape_if_1d(output_shape));
+
+            auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+                                                                  w_desc.get(),
+                                                                  x_desc.get(),
+                                                                  cd.get(),
+                                                                  y_desc.get(),
+                                                                  solution_id);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
+        }
+#endif
+    }
+
+    // Returns the index of the last entry in `shapes` (shapes.size() - 1) as a
+    // std::ptrdiff_t. NOTE(review): presumably this tells the framework that the
+    // output aliases the final argument buffer - confirm against the callers.
+    inline std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
-};

+    // Builds the padded variant of an int8 shape: the extent of dimension 1 is
+    // rounded up to the next multiple of 4 and stride 0 is recomputed as
+    // strides[1] * (padded extent). Shapes of any other type are returned as-is.
+    // NOTE(review): indexes lens[1] / strides[1] directly, so it assumes the
+    // shape has at least two dimensions - confirm all callers guarantee this.
+    inline shape pack_int8_shape(const shape& s) const
+    {
+        // Only int8 tensors are repacked.
+        if(s.type() != shape::int8_type)
+            return s;
+
+        auto packed_lens    = s.lens();
+        auto packed_strides = s.strides();
+
+        // Round dimension 1 up to a multiple of 4.
+        const auto padded_dim1 = ((packed_lens[1] + 3) / 4) * 4;
+        packed_lens[1]         = padded_dim1;
+        packed_strides[0]      = packed_strides[1] * padded_dim1;
+
+        return shape{s.type(), packed_lens, packed_strides};
+    }
+};
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx