Commit 9b929d4e authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

parents c4b1102e 4394e9b3
@@ -26,6 +26,9 @@
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/tune_axis.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/stringutils.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -55,12 +58,12 @@ struct parse_split : op_parser<parse_split>
         {
             literal s = parser.parse_value(info.attributes.at("split"));
             s.visit([&](auto v) { vec_splits.assign(v.begin(), v.end()); });
-            if(std::accumulate(vec_splits.begin(), vec_splits.end(), int64_t(0)) !=
-               static_cast<int64_t>(lens[tuned_axis]))
-            {
-                MIGRAPHX_THROW("PARSE_SPLIT: sum of split attribute unequal to dim size of axis!");
-            }
+        }
+        else if(args.size() == 2)
+        {
+            auto s = args[1]->eval();
+            check_arg_empty(s, "Split: dynamic shape is not supported");
+            s.visit([&](auto v) { vec_splits.assign(v.begin(), v.end()); });
         }
         // no split attribute, input is equally divided
         else
@@ -74,6 +77,15 @@ struct parse_split : op_parser<parse_split>
             vec_splits.resize(info.num_outputs, dl);
         }
+        if(std::accumulate(vec_splits.begin(), vec_splits.end(), int64_t(0)) !=
+           static_cast<int64_t>(lens[tuned_axis]))
+        {
+            MIGRAPHX_THROW(
+                "PARSE_SPLIT: sum of split attribute unequal to dim size of axis! tuned axis:" +
+                std::to_string(lens[tuned_axis]) + " Output " + to_string_range(vec_splits) +
+                " Rank " + std::to_string(n_rank) + " Len outs " + to_string_range(lens));
+        }
         std::vector<instruction_ref> ret_ins;
         int64_t start = 0;
         for(auto sl : vec_splits)
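For reference, a minimal standalone sketch of the sum-of-splits check introduced above, using made-up values (this is illustrative, not MIGraphX code):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // Hypothetical split request on an axis of length 6.
    std::vector<int64_t> vec_splits = {2, 2, 1};
    int64_t axis_len                = 6;
    auto total = std::accumulate(vec_splits.begin(), vec_splits.end(), int64_t(0));
    if(total != axis_len)
        std::cout << "parse_split would throw: splits sum to " << total
                  << ", axis length is " << axis_len << "\n";
}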
@@ -47,7 +47,7 @@ struct parse_transpose : op_parser<parse_transpose>
         }
         // if perm is empty, use the default value
-        auto n_dim = args.front()->get_shape().lens().size();
+        auto n_dim = args.front()->get_shape().ndim();
         if(perm.empty())
         {
             perm.resize(n_dim);
@@ -94,11 +94,19 @@ struct module_pm : module_pass_manager
     virtual void run_pass(const pass& p) override
     {
         assert(mod);
+        timer ts{};
+        using seconds = std::chrono::duration<double>;
         trace("Module: ", mod->name(), ", Pass: ", p.name());
+        const double t1 = ts.record<seconds>();
         assert(mod->validate() == mod->end());
         p.apply(*this);
         trace(*mod);
         validate_pass(*mod, p, *t);
+        const double t2 = ts.record<seconds>();
+        trace("Pass: ", p.name(), " completed in (s): ", (t2 - t1));
     }
 };
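The timer used above is a MIGraphX utility defined elsewhere in the tree; a plausible stand-in, assuming a simple steady_clock wrapper, would be:

#include <chrono>

// Minimal stand-in for the timer type used in run_pass (an assumption, not the real class).
struct timer
{
    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();

    // Elapsed time since construction, converted to the requested duration type.
    template <class Duration>
    double record() const
    {
        return std::chrono::duration_cast<Duration>(std::chrono::steady_clock::now() - start)
            .count();
    }
};

With using seconds = std::chrono::duration<double>;, two calls to ts.record<seconds>() bracket the pass and their difference is the wall-clock time reported by trace().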
@@ -854,6 +854,25 @@ void program::print_graph(std::ostream& os, bool brief) const
     mm->print_graph(os, brief);
 }
+void program::print_py(std::ostream& os) const
+{
+    auto vec_modules = this->get_modules();
+    std::unordered_map<instruction_ref, std::string> names;
+    os << "p = migraphx.program()\n";
+    for(auto& mod : vec_modules)
+    {
+        std::string var_name = "m" + mod->name();
+        os << var_name << " = ";
+        if(mod->name() == "main")
+            os << "p.get_main_module()";
+        else
+            os << "p.create_module(\"" << mod->name() << "\");";
+        os << std::endl;
+        names = mod->print_py(os, var_name, names);
+        os << std::endl;
+    }
+}
 void program::print_cpp(std::ostream& os) const
 {
     auto vec_modules = this->get_modules();
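A hedged usage sketch of the new serializer (the driver function and header path are assumptions, not part of this diff):

#include <iostream>
#include <migraphx/program.hpp> // assumed location of migraphx::program

void dump_as_python(const migraphx::program& p)
{
    // Emits a Python script that rebuilds the program through the migraphx Python API:
    // "p = migraphx.program()", then "mmain = p.get_main_module()" (or create_module for
    // submodules), followed by one line per instruction from module::print_py.
    p.print_py(std::cout);
}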
@@ -92,7 +92,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // process sequence length
     instruction_ref seq_lens = m.end();
-    if((args.size() >= 5) && args[4]->name() != "undefined")
+    if((args.size() >= 5) and not args[4]->is_undefined())
     {
         seq_lens = args[4];
     }
@@ -117,7 +117,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // process bias
     instruction_ref bias_forward = m.end();
     instruction_ref bias_reverse = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[3]);
@@ -129,7 +129,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // or the 5th one (if the sequence len argument is ignored)
     instruction_ref ih_forward{};
     instruction_ref ih_reverse{};
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[5]);
@@ -195,14 +195,14 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // process bias and initial hidden state
     instruction_ref bias = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias = args[3];
     }
     // process intial hidden state
     instruction_ref ih;
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih = args[5];
     }
@@ -398,7 +398,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // process sequence length
     instruction_ref seq_lens = m.end();
-    if((args.size() >= 5) && args[4]->name() != "undefined")
+    if((args.size() >= 5) and not args[4]->is_undefined())
     {
         seq_lens = args[4];
     }
@@ -423,7 +423,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // bias
     instruction_ref bias_forward = m.end();
     instruction_ref bias_reverse = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[3]);
@@ -434,7 +434,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // intial hidden state
     instruction_ref ih_forward{};
     instruction_ref ih_reverse{};
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[5]);
@@ -501,14 +501,14 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // bias
     instruction_ref bias = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias = args[3];
     }
     // intial hidden state
     instruction_ref ih{};
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih = args[5];
     }
@@ -784,7 +784,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process sequence length
     instruction_ref seq_lens = m.end();
-    if((args.size() >= 5) && args[4]->name() != "undefined")
+    if((args.size() >= 5) and not args[4]->is_undefined())
     {
         seq_lens = args[4];
     }
@@ -813,7 +813,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process bias
     instruction_ref bias_forward = m.end();
     instruction_ref bias_reverse = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[3]);
@@ -824,7 +824,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process intial hidden state, it is the 6th argument
     instruction_ref ih_forward{};
     instruction_ref ih_reverse{};
-    if(args.size() >= 6 && args[5]->name() != "undefined")
+    if(args.size() >= 6 and not args[5]->is_undefined())
     {
         ih_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[5]);
@@ -840,7 +840,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process initial cell value
     instruction_ref ic_forward{};
     instruction_ref ic_reverse{};
-    if(args.size() >= 7 && args[6]->name() != "undefined")
+    if(args.size() >= 7 and not args[6]->is_undefined())
     {
         ic_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[6]);
@@ -856,7 +856,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process weight of the peephole
     instruction_ref pph_forward = m.end();
     instruction_ref pph_reverse = m.end();
-    if(args.size() == 8 && args[7]->name() != "undefined")
+    if(args.size() == 8 and not args[7]->is_undefined())
     {
         pph_forward = m.insert_instruction(
             ins, make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), args[7]);
@@ -940,14 +940,14 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // bias
     instruction_ref bias = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
    {
         bias = args[3];
     }
     // initial hidden state
     instruction_ref ih{};
-    if(args.size() >= 6 && args[5]->name() != "undefined")
+    if(args.size() >= 6 and not args[5]->is_undefined())
     {
         ih = args[5];
     }
@@ -958,7 +958,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // initial cell value
     instruction_ref ic{};
-    if(args.size() >= 7 && args[6]->name() != "undefined")
+    if(args.size() >= 7 and not args[6]->is_undefined())
    {
         ic = args[6];
     }
@@ -969,7 +969,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process weight of the peephole
     instruction_ref pph = m.end();
-    if(args.size() == 8 && args[7]->name() != "undefined")
+    if(args.size() == 8 and not args[7]->is_undefined())
    {
         pph = args[7];
     }
@@ -521,6 +521,14 @@ std::ostream& operator<<(std::ostream& os, const shape::dynamic_dimension& x)
     return os;
 }
+bool operator==(const shape::dynamic_dimension& x, const std::size_t& y)
+{
+    return x.min == y and x.max == y;
+}
+bool operator==(const std::size_t& x, const shape::dynamic_dimension& y) { return y == x; }
+bool operator!=(const shape::dynamic_dimension& x, const std::size_t& y) { return not(x == y); }
+bool operator!=(const std::size_t& x, const shape::dynamic_dimension& y) { return not(x == y); }
 bool operator==(const shape& x, const shape& y)
 {
     if(x.dynamic() and y.dynamic())
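Behaviorally, the new operators make a dynamic dimension compare equal to a fixed extent only when it is degenerate (min == max). A standalone sketch with a stand-in type, not the real shape::dynamic_dimension:

#include <cstddef>
#include <iostream>

struct dyn_dim
{
    std::size_t min;
    std::size_t max;
};

bool operator==(const dyn_dim& x, std::size_t y) { return x.min == y and x.max == y; }
bool operator!=(const dyn_dim& x, std::size_t y) { return not(x == y); }

int main()
{
    std::cout << (dyn_dim{4, 4} == std::size_t{4}) << "\n"; // 1: a fixed dimension of 4
    std::cout << (dyn_dim{1, 4} == std::size_t{4}) << "\n"; // 0: a range never equals a scalar
}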
@@ -51,7 +51,18 @@ struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
         auto r = s0;
         if(s0 != s1 or not s0.packed())
         {
-            r = shape{s0.type(), s0.lens()};
+            if(s0.packed() != s1.packed())
+            {
+                r = s0.packed() ? s0 : s1;
+            }
+            else if(s0.broadcasted() != s1.broadcasted())
+            {
+                r = s0.broadcasted() ? s1.with_lens(s0.lens()) : s0.with_lens(s0.lens());
+            }
+            else
+            {
+                r = {s0.type(), s0.lens()};
+            }
         }
         // Call to get_primitive to make sure an algo is available
         this->get_primitive(this->to_memory_desc(r, inputs));
@@ -43,9 +43,9 @@ struct dnnl_convolution
         return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
     }
-    shape adjust_shape(const shape& x, int i) const
+    shape adjust_shape(const shape& x, int i, const shape& output) const
     {
-        auto s = base_adjust_shape(x);
+        auto s = base_adjust_shape(x, output);
         if(i == 1 and op.group > 1)
         {
             // TODO: Add support for transposed weights
@@ -37,9 +37,9 @@ struct dnnl_deconvolution
         return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
     }
-    shape adjust_shape(const shape& x, int i) const
+    shape adjust_shape(const shape& x, int i, const shape& output) const
     {
-        auto s = base_adjust_shape(x);
+        auto s = base_adjust_shape(x, output);
         if(i == 1)
         {
             // The input and output channels are flipped for dnnl
@@ -167,7 +167,7 @@ struct dnnl_op : auto_register_op<Derived>
         std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_SRC_0));
         return result;
     }
-    shape base_adjust_shape(const shape& s) const
+    shape base_adjust_shape(const shape& s, const shape& output) const
     {
         if(s.broadcasted())
         {
@@ -183,7 +183,8 @@ struct dnnl_op : auto_register_op<Derived>
                 else
                     return len;
             });
-            return shape{s.type(), lens};
+            // Use the permutation of the output
+            return output.with_lens(s.type(), lens);
         }
         return s;
     }
@@ -204,7 +205,10 @@ struct dnnl_op : auto_register_op<Derived>
             i++;
         }
     }
-    shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
+    shape adjust_shape(const shape& s, int, const shape& output) const
+    {
+        return base_adjust_shape(s, output);
+    }
     std::vector<int> create_arg_map(std::size_t input_size) const
    {
         const auto& self = static_cast<const Derived&>(*this);
@@ -224,12 +228,12 @@ struct dnnl_op : auto_register_op<Derived>
         const auto& self = static_cast<const Derived&>(*this);
         std::unordered_map<int, dnnl::memory::desc> result;
         result[MIGRAPHX_DNNL_PREFIX(ARG_DST)] =
-            to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size()));
+            to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size(), output_shape));
         auto m = create_arg_map(inputs.size());
         assert(m.size() >= inputs.size());
         for(int i = 0; i < inputs.size(); i++)
         {
-            result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i));
+            result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i, output_shape));
         }
         return result;
     }
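The intent of the output.with_lens(...) change is that broadcast operands get materialized in the same axis permutation as the destination. A hedged illustration with made-up channels-last strides, assuming (as the comment above suggests) that shape::with_lens carries over the calling shape's permutation:

#include <iostream>
#include <migraphx/shape.hpp> // assumed header for migraphx::shape

int main()
{
    using migraphx::shape;
    // Output tensor with lens {2, 8, 4, 4} stored channels-last (hypothetical strides).
    shape output{shape::float_type, {2, 8, 4, 4}, {128, 1, 32, 8}};
    // Materialize a broadcast operand with the same lens; the result keeps the
    // channels-last stride ordering of `output` rather than a standard packed layout.
    auto adjusted = output.with_lens(shape::float_type, {2, 8, 4, 4});
    std::cout << adjusted << "\n";
}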
@@ -32,7 +32,7 @@ struct dnnl_reorder : dnnl_op<dnnl_reorder, dnnl::reorder>
 {
     std::string name() const { return "dnnl::reorder"; }
-    shape adjust_shape(const shape& x, int) const { return x; }
+    shape adjust_shape(const shape& x, int, const shape&) const { return x; }
     shape compute_shape(const std::vector<shape>& inputs) const
     {
@@ -33,6 +33,7 @@
 #include <migraphx/eliminate_data_type.hpp>
 #include <migraphx/eliminate_identity.hpp>
 #include <migraphx/eliminate_pad.hpp>
+#include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
@@ -82,6 +83,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         simplify_algebra{},
         simplify_reshapes{},
+        layout_nhwc{},
+        dead_code_elimination{},
+        simplify_reshapes{},
         simplify_algebra{},
         auto_contiguous{},
         simplify_reshapes{},
@@ -83,6 +83,7 @@ add_library(migraphx_gpu
   compile_gen.cpp
   compile_hip.cpp
   compile_hip_code_object.cpp
+  compile_miopen.cpp
   compiler.cpp
   device_name.cpp
   fuse_mlir.cpp
@@ -232,11 +233,14 @@ get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
 check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
-if(HAS_FIND_2_API)
+# TODO: Set default to HAS_FIND_2_API
+set(MIGRAPHX_USE_FIND_2_API OFF CACHE BOOL "")
+if(MIGRAPHX_USE_FIND_2_API)
     target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
     message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
 else()
-    message(STATUS "MIOpen does not have Find-2.0 API")
+    message(STATUS "MIGraphx is using legacy Find API in MIOpen")
 endif()
 if(HAS_FIND_MODE_API)
@@ -185,7 +185,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     options.push_back("-fno-gpu-rdc");
     options.push_back(" -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
     options.push_back("-Wno-cuda-compat");
-    options.push_back("--cuda-gpu-arch=" + arch);
+    options.push_back("--offload-arch=" + arch);
     prog.compile(options);
     return {prog.get_code_obj()};
 }
@@ -237,7 +237,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     }
     else if(is_hip_clang_compiler())
     {
-        params += " --cuda-gpu-arch=" + arch;
+        params += " --offload-arch=" + arch;
         params += " --cuda-device-only";
         params += " -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " ";
     }
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/rocblas.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct miopen_op
{
operation op = op::identity{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::miopen_op"; }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.push_back(inputs.back());
return op.compute_shape(inputs);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
MIGRAPHX_REGISTER_OP(miopen_op);
std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const
{
op.from_value({{"int8_x4_format", format}});
auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
return v.get<std::size_t>("workspace", 0);
}
void compile_miopen::apply(module& m) const
{
assert(ctx);
const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx));
for(auto ins : iterator_for(m))
{
if(ins->name() != "gpu::miopen_op")
continue;
auto op = any_cast<miopen_op>(ins->get_operator()).op;
std::size_t ws = 0;
try
{
// for the regular convolution and deconvolution, this try would always succeed
ws = compile(op, ins, int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
ws = compile(op, ins, not int8_x4_format);
}
auto inputs = ins->inputs();
auto alloc = m.insert_instruction(
ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
inputs.insert(std::prev(inputs.end()), alloc);
m.replace_instruction(ins, op, inputs);
}
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
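A hedged sketch of driving the new pass directly (the header paths and setup are assumptions; in practice the pass is presumably scheduled by the GPU target's lowering pipeline):

#include <migraphx/context.hpp> // assumed header for the type-erased migraphx::context
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/module.hpp>

void lower_miopen_ops(migraphx::module& m, migraphx::context& ctx)
{
    // Compiles every gpu::miopen_op against the MIOpen context and rewrites it with
    // the compiled operator plus an int8 workspace allocation as an extra input.
    migraphx::gpu::compile_miopen pass{&ctx};
    pass.apply(m);
}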
@@ -39,19 +39,26 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL);
 struct precompile_op
 {
     operation op = op::identity{};
+    std::size_t additional_args = 1;
+    bool ignore_modules = false;
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
-        return pack(f(self.op, "op"));
+        return pack(f(self.op, "op"),
+                    f(self.additional_args, "additional_args"),
+                    f(self.ignore_modules, "ignore_modules"));
     }
     std::string name() const { return "gpu::precompile_op"; }
     shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
     {
-        inputs.pop_back();
+        // Pop off additional args
+        inputs.resize(inputs.size() - additional_args);
+        if(ignore_modules)
+            return op.compute_shape(inputs);
         return op.compute_shape(inputs, mods);
     }
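A simplified standalone sketch of the trimming that compute_shape now performs (hypothetical helper, not MIGraphX code):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// The wrapped op should only see the logical inputs; the trailing `additional_args`
// entries (by default just the preallocated output buffer) are dropped first.
std::vector<std::string> trim_additional_args(std::vector<std::string> inputs,
                                              std::size_t additional_args)
{
    assert(inputs.size() >= additional_args);
    inputs.resize(inputs.size() - additional_args);
    return inputs;
}

int main()
{
    // {x, y, allocation} with additional_args = 1 -> the op computes its shape from {x, y}.
    auto logical = trim_additional_args({"x", "y", "allocation"}, 1);
    (void)logical;
}

The concat+pointwise fusion added later in this commit sets both new fields when it appends the pointwise inputs onto a precompiled concat.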
@@ -772,11 +772,9 @@ struct find_layernorm_pointwise
     {
         auto ins = r.result;
         auto layernorm = r.instructions["layernorm"];
-        auto* pm = ins->module_inputs().front();
         if(not layernorm->module_inputs().empty())
             return;
+        auto* pm = ins->module_inputs().front();
         auto inputs = layernorm->inputs();
         inputs.pop_back();
         inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
@@ -785,6 +783,37 @@ struct find_layernorm_pointwise
     }
 };
+struct find_concat_pointwise
+{
+    auto matcher() const
+    {
+        return precompile_name("pointwise")(
+            match::arg(0)(precompile_name("concat").bind("concat")));
+    }
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins = r.result;
+        auto concat = r.instructions["concat"];
+        if(not concat->module_inputs().empty())
+            return;
+        // TODO: Handle type conversions
+        if(ins->get_shape().type() != concat->get_shape().type())
+            return;
+        auto* pm = ins->module_inputs().front();
+        auto inputs = concat->inputs();
+        inputs.pop_back();
+        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
+        auto op = concat->get_operator();
+        op.from_value({{"additional_args", ins->inputs().size() - 1}, {"ignore_modules", true}});
+        m.replace_instruction(ins, op, inputs, {pm});
+    }
+};
 void fuse_ops::apply(module& m) const
 {
     match::find_matches(m, find_contiguous_pointwise{});
@@ -793,6 +822,7 @@ void fuse_ops::apply(module& m) const
     run_passes(m, {dead_code_elimination{}});
     match::find_matches(m,
                         find_layernorm_pointwise{},
+                        find_concat_pointwise{},
                         find_gemm_pointwise{},
                         find_contiguous_tranpose_gemm{},
                         find_commutative_broadcast{});
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/permutation.hpp>
@@ -196,12 +196,21 @@ argument to_gpu(const argument& arg, bool host)
 argument from_gpu(const argument& arg)
 {
     argument result;
-    arg.visit([&](auto x) {
-        using type = typename decltype(x)::value_type;
-        auto v = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
-        // cppcheck-suppress returnDanglingLifetime
-        result = {x.get_shape(), [v]() mutable { return v.data(); }};
-    });
+    arg.visit(
+        [&](auto x) {
+            using type = typename decltype(x)::value_type;
+            auto v = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
+            // cppcheck-suppress returnDanglingLifetime
+            result = {x.get_shape(), [v]() mutable { return v.data(); }};
+        },
+        [&](const auto& xs) {
+            std::vector<argument> args;
+            std::transform(xs.begin(), xs.end(), std::back_inserter(args), [&](auto x) {
+                return from_gpu(x);
+            });
+            result = argument{args};
+        });
     return result;
 }
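A hedged usage sketch of the tuple-aware copy (the header paths and the to_gpu pairing are assumptions for illustration):

#include <migraphx/argument.hpp> // assumed header for migraphx::argument
#include <migraphx/gpu/hip.hpp>  // assumed header declaring to_gpu/from_gpu

migraphx::argument round_trip(const migraphx::argument& host_arg)
{
    auto device_arg = migraphx::gpu::to_gpu(host_arg, false);
    // With the second visitor above, a tuple argument is copied back sub-argument by
    // sub-argument and re-packed with argument{args}; plain buffers behave as before.
    return migraphx::gpu::from_gpu(device_arg);
}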
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
#include <migraphx/config.hpp>
#include <migraphx/instruction_ref.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
struct context;
struct operation;
namespace gpu {
struct compile_miopen
{
context* ctx = nullptr;
std::string name() const { return "gpu::compile_miopen"; }
void apply(module& m) const;
std::size_t compile(operation& op, instruction_ref ins, bool format) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP