Merge branch 'develop' into resnet50_partition

124f7d55 · Umang Yadav · 350bbea2 · 34b68ee4 · 124f7d55 · 124f7d55
Commit 124f7d55 authored Oct 12, 2023 by Umang Yadav
20 changed files
--- a/src/py/CMakeLists.txt
+++ b/src/py/CMakeLists.txt
@@ -22,27 +22,24 @@
 # THE SOFTWARE.
 #####################################################################################

-option(MIGRAPHX_ENABLE_PYTHON "Enable python bindings" ON)
 add_library(migraphx_py py_loader.cpp)
 migraphx_generate_export_header(migraphx_py)
 target_include_directories(migraphx_py PRIVATE include)
 target_link_libraries(migraphx_py PUBLIC migraphx)
 rocm_install_targets(TARGETS migraphx_py INCLUDE include)
-if(MIGRAPHX_ENABLE_PYTHON)
-    include(PythonModules)

+include(PythonModules)

-    foreach(PYTHON_VERSION ${PYTHON_VERSIONS})
-        py_add_module(migraphx_pybind_${PYTHON_VERSION} migraphx_py.cpp PYTHON_VERSION ${PYTHON_VERSION} PYTHON_MODULE migraphx)
-        target_link_libraries(migraphx_pybind_${PYTHON_VERSION} PRIVATE migraphx migraphx_tf migraphx_onnx migraphx_all_targets)
-        rocm_install_targets(TARGETS migraphx_pybind_${PYTHON_VERSION})
-        add_dependencies(migraphx_py migraphx_pybind_${PYTHON_VERSION})
-        
-        add_library(migraphx_py_${PYTHON_VERSION} py.cpp)
-        target_include_directories(migraphx_py_${PYTHON_VERSION} PRIVATE include)
-        target_link_libraries(migraphx_py_${PYTHON_VERSION} PUBLIC migraphx)
-        target_link_libraries(migraphx_py_${PYTHON_VERSION} PRIVATE pybind11::pybind11 python${PYTHON_VERSION}::runtime)
-        rocm_install_targets(TARGETS migraphx_py_${PYTHON_VERSION})
-        add_dependencies(migraphx_py migraphx_py_${PYTHON_VERSION})
-    endforeach()
-endif()
+foreach(PYTHON_VERSION ${PYTHON_VERSIONS})
+    py_add_module(migraphx_pybind_${PYTHON_VERSION} migraphx_py.cpp PYTHON_VERSION ${PYTHON_VERSION} PYTHON_MODULE migraphx)
+    target_link_libraries(migraphx_pybind_${PYTHON_VERSION} PRIVATE migraphx migraphx_tf migraphx_onnx migraphx_all_targets)
+    rocm_install_targets(TARGETS migraphx_pybind_${PYTHON_VERSION})
+    add_dependencies(migraphx_py migraphx_pybind_${PYTHON_VERSION})
+
+    add_library(migraphx_py_${PYTHON_VERSION} py.cpp)
+    target_include_directories(migraphx_py_${PYTHON_VERSION} PRIVATE include)
+    target_link_libraries(migraphx_py_${PYTHON_VERSION} PUBLIC migraphx)
+    target_link_libraries(migraphx_py_${PYTHON_VERSION} PRIVATE pybind11::pybind11 python${PYTHON_VERSION}::runtime)
+    rocm_install_targets(TARGETS migraphx_py_${PYTHON_VERSION})
+    add_dependencies(migraphx_py migraphx_py_${PYTHON_VERSION})
+endforeach()
--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -70,6 +70,10 @@ void quantize_int8(program& prog,
        MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");
    }

+    // Run optimize_module() before converting to int8 to const eval and fold in FP32 to
+    // avoid loss of precision.
+    run_passes(prog, {optimize_module{}});
+
    std::shared_ptr<std::vector<std::pair<float, float>>> int8_quant_params =
        std::make_shared<std::vector<std::pair<float, float>>>();
    std::shared_ptr<std::vector<float>> max_abs_vals = std::make_shared<std::vector<float>>();
@@ -143,10 +147,7 @@ void quantize_int8(program& prog,

    run_passes(prog,
               {quantize_int8_pass{ins_names, *int8_quant_params},
-                eliminate_common_subexpression{},
-                dead_code_elimination{},
-                simplify_reshapes{},
-                dead_code_elimination{},
+                optimize_module{},
                simplify_qdq{},
                dead_code_elimination{}});
 }

--- a/src/rewrite_pooling.cpp
+++ b/src/rewrite_pooling.cpp
@@ -43,9 +43,7 @@ void rewrite_pooling::apply(module& m) const
            continue;
        if(ins->inputs().empty())
            continue;
-        auto&& s = ins->inputs().front()->get_shape();
-        if(not s.standard())
-            continue;
+        auto&& s  = ins->inputs().front()->get_shape();
        auto&& op = any_cast<op::pooling>(ins->get_operator());
        if(not std::all_of(op.padding.begin(), op.padding.end(), [](auto i) { return i == 0; }))
            continue;
@@ -54,27 +52,18 @@ void rewrite_pooling::apply(module& m) const
        auto lens = s.lens();
        if(not std::equal(lens.begin() + 2, lens.end(), op.lengths.begin(), op.lengths.end()))
            continue;
-        std::int64_t n = s.lens()[0];
-        std::int64_t c = s.lens()[1];
-        auto reshape   = m.insert_instruction(
-            ins, make_op("reshape", {{"dims", {n * c, -1}}}), ins->inputs().front());
-        instruction_ref pooling{};
-
+        std::vector<std::int64_t> axes(lens.size() - 2);
+        std::iota(axes.begin(), axes.end(), 2);
        // average pooling
        if(op.mode == op::pooling_mode::average)
        {
-            pooling = m.insert_instruction(ins, make_op("reduce_mean", {{"axes", {1}}}), reshape);
+            m.replace_instruction(ins, make_op("reduce_mean", {{"axes", axes}}), ins->inputs());
        }
        // max pooling
        else
        {
-            pooling = m.insert_instruction(ins, make_op("reduce_max", {{"axes", {1}}}), reshape);
+            m.replace_instruction(ins, make_op("reduce_max", {{"axes", axes}}), ins->inputs());
        }
-
-        std::vector<int64_t> rsp_lens(lens.size(), 1);
-        rsp_lens[0] = n;
-        rsp_lens[1] = c;
-        m.replace_instruction(ins, make_op("reshape", {{"dims", rsp_lens}}), pooling);
    }
 }


--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -1325,48 +1325,59 @@ struct find_split_reshape

    void apply(module& m, const match::matcher_result& r) const
    {
-        auto slc = r.instructions["slice"];
-        auto rsp = r.instructions["reshape"];
+        auto slc   = r.instructions["slice"];
+        auto rsp   = r.instructions["reshape"];
+        auto input = slc->inputs().front();
+
+        // Only apply simplification when slices are on a single axis
+        auto axes = any_cast<op::slice>(slc->get_operator()).axes;
+        if(axes.size() > 1)
+        {
+            return;
+        }

-        auto input         = slc->inputs().front();
        auto split_outputs = get_splits(input);
        if(split_outputs.empty())
        {
            return;
        }

-        // Only want to apply this optimization if each split output is followed by
-        // a contiguous op and a reshape
-        if(std::any_of(split_outputs.begin(), split_outputs.end(), [](auto i) {
-               if(i->outputs().size() == 1)
-               {
-                   auto cont = i->outputs().front();
-                   return cont->outputs().size() != 1;
-               }
-               return false;
-           }))
+        // Find all the reshapes (similar to rsp) that can be simplified
+        std::vector<instruction_ref> conts;
+        std::vector<instruction_ref> vec_rsp;
+
+        // Iterate through slice and contiguous outputs to allow simplifications when
+        // slice is followed by multiple reshapes
+        for(auto& i : split_outputs)
        {
-            return;
+            std::copy_if(i->outputs().begin(),
+                         i->outputs().end(),
+                         std::back_inserter(conts),
+                         [](auto j) { return j->name() == "contiguous"; });
        }

-        std::vector<instruction_ref> vec_rsp(split_outputs.size());
-        std::transform(split_outputs.begin(), split_outputs.end(), vec_rsp.begin(), [](auto i) {
-            auto cont = i->outputs().front();
-            return cont->outputs().front();
-        });
+        for(auto& i : conts)
+        {
+            std::copy_if(i->outputs().begin(),
+                         i->outputs().end(),
+                         std::back_inserter(vec_rsp),
+                         [&](auto j) { return j->get_operator() == rsp->get_operator(); });
+        }

-        // all outputs are reshape and of the same shape
-        auto dims = any_cast<op::reshape>(rsp->get_operator()).dims;
-        if(not same_ops(vec_rsp))
+        // No simplification needed if there is only one slice -> cont -> reshape
+        if(vec_rsp.size() <= 1)
        {
            return;
        }

        // ensure reshape happens after the axis dimension
-        auto axis         = any_cast<op::slice>(slc->get_operator()).axes[0];
+        auto axis         = axes[0];
        auto slc_lens     = slc->get_shape().lens();
        auto slc_dim_size = std::accumulate(
            slc_lens.begin() + axis, slc_lens.end(), 1, std::multiplies<std::size_t>());
+        auto input_lens   = input->get_shape().lens();
+        auto input_size   = input->get_shape().elements();
+        auto slc_axis_len = input_lens[axis];

        // search the reshape output (standard shape) to decide which axis are
        // in its output corresponding to the slc_dim_size
@@ -1393,16 +1404,67 @@ struct find_split_reshape
        {
            rsp_axis = std::distance(rsp_strides.begin(), ait);
        }
-        // calculate reshape output shape
-        std::vector<int64_t> vec_dims(vec_rsp.size());

-        std::transform(vec_rsp.begin(), vec_rsp.end(), vec_dims.begin(), [&](auto is) {
-            return is->get_shape().lens()[rsp_axis];
-        });
+        // Calculate reshape output shape
+        // Need to find a reshape such that data represented by instructions in vec_rsp can be
+        // written as slices of this new reshape. This is done by holding all the dims constant in
+        // rsp_lens to compute the required dim for rsp_axis (axis that will be sliced)
+
+        // ex 1:  Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
+        //        Reshape Outputs: {2, 2, 2, 4}, {2, 2, 2, 4}, {2, 2, 2, 4}
+        //        rsp_axis = 1, rsp_out_lens (initial) = {2, 1, 2, 4}, rsp_fixed_size = 2*1*2*4 = 16
+        //        rsp_axis_len = 2*12*4 / 16 = 6
+        //        rsp_out_lens (final) = {2, 6, 2, 4}
+
+        // ex 2:  Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
+        //        Reshape Outputs: {2, 16}, {2, 16}, {2, 16}
+        //        rsp_axis = 1, rsp_out_lens (initial) = {2, 1}, rsp_fixed_size = 2*1 = 2
+        //        rsp_axis_len = 2*12*4 / 2 = 48
+        //        rsp_out_lens (final) = {2, 48}

        std::vector<int64_t> rsp_out_lens(rsp_lens.begin(), rsp_lens.end());
+        rsp_out_lens[rsp_axis] = 1;
+        auto rsp_fixed_size    = std::accumulate(
+            rsp_out_lens.begin(), rsp_out_lens.end(), 1, std::multiplies<std::size_t>());

-        rsp_out_lens[rsp_axis] = std::accumulate(vec_dims.begin(), vec_dims.end(), std::int64_t{0});
+        // cannot create a valid reshape for simplification
+        if(input_size % rsp_fixed_size != 0)
+        {
+            return;
+        }
+        auto rsp_axis_len      = input_size / rsp_fixed_size;
+        rsp_out_lens[rsp_axis] = rsp_axis_len;
+
+        // Calculate new slice start and end indices. Indices are scaled using the new reshape axis
+        // and the original slice axis. See examples:
+
+        // ex 1:  Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
+        //        Reshape Outputs: {2, 2, 2, 4}, {2, 2, 2, 4}, {2, 2, 2, 4}
+        //        slc_axis_len = 12, rsp_axis_len = 6
+        //        New Starts: {0*6/12, 4*6/12,  8*6/12} = {0, 2, 4}
+        //        New Ends:   {4*6/12, 8*6/12, 12*6/12} = {2, 4, 6}
+
+        // ex 2:  Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
+        //        Reshape Outputs: {2, 16}, {2, 16}, {2, 16}
+        //        slc_axis_len = 12, rsp_axis_len = 48
+        //        New Starts: {0*48/12, 4*48/12,  8*48/12} = { 0, 16, 32}
+        //        New Ends:   {4*48/12, 8*48/12, 12*48/12} = {16, 32, 48}
+
+        std::vector<int64_t> new_starts(vec_rsp.size());
+        std::transform(vec_rsp.begin(), vec_rsp.end(), new_starts.begin(), [&](auto is) {
+            auto cont   = is->inputs().front();
+            auto og_slc = cont->inputs().front();
+            return any_cast<op::slice>(og_slc->get_operator()).starts[0] * rsp_axis_len /
+                   slc_axis_len;
+        });
+
+        std::vector<int64_t> new_ends(vec_rsp.size());
+        std::transform(vec_rsp.begin(), vec_rsp.end(), new_ends.begin(), [&](auto is) {
+            auto cont   = is->inputs().front();
+            auto og_slc = cont->inputs().front();
+            return any_cast<op::slice>(og_slc->get_operator()).ends[0] * rsp_axis_len /
+                   slc_axis_len;
+        });

        // insert the reshape instruction and add contiguous if needed
        if(not input->get_shape().standard())
@@ -1413,16 +1475,14 @@ struct find_split_reshape
            std::next(input), make_op("reshape", {{"dims", rsp_out_lens}}), input);

        // replace the original reshape with slice
-        int64_t start = 0;
        for(std::size_t i = 0; i < vec_rsp.size(); ++i)
        {
            m.replace_instruction(
                vec_rsp[i],
                make_op(
                    "slice",
-                    {{"axes", {rsp_axis}}, {"starts", {start}}, {"ends", {start + vec_dims[i]}}}),
+                    {{"axes", {rsp_axis}}, {"starts", {new_starts[i]}}, {"ends", {new_ends[i]}}}),
                rsp_ins);
-            start += vec_dims[i];
        }
    }
 };

--- a/src/simplify_dyn_ops.cpp
+++ b/src/simplify_dyn_ops.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/simplify_dyn_ops.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/make_op.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+/**
+ * Rewrite a 2 input broadcast/multibroadcast whose inputs both have static shapes
+ * into the 1 input version, with the output lens stored on the operator itself.
+ * Some compiler passes (ex. simplify_algebra) only support the 1 input versions
+ * of the broadcasting operators.
+ */
+struct find_static_2in_broadcasts
+{
+    // Matches through match::broadcast: exactly two arguments, both with a static shape.
+    auto matcher() const
+    {
+        return match::broadcast(match::nargs(2),
+                                match::arg(0)(match::static_shape()),
+                                match::arg(1)(match::static_shape()));
+    }
+
+    // Re-creates the operator from the instruction's own output lens and replaces the
+    // instruction with a version that keeps only the first input.
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto bcast_ins = mr.result;
+        auto new_op    = bcast_ins->get_operator();
+        auto lens      = bcast_ins->get_shape().lens();
+        if(new_op.name() != "broadcast")
+        {
+            // any matched operator other than `broadcast` also gets an empty `out_dyn_dims`
+            new_op.from_value({{"out_lens", lens}, {"out_dyn_dims", {}}});
+        }
+        else
+        {
+            new_op.from_value({{"out_lens", lens}});
+        }
+        m.replace_instruction(bcast_ins, new_op, bcast_ins->inputs().at(0));
+    }
+};
+
+/**
+ * Turn a 3 input slice (data, `input_starts`, `input_ends`) into the 1 input
+ * version with constant `starts` and `ends` when both index inputs are constant.
+ * The `axes` attribute of the existing operator is carried over unchanged.
+ */
+struct find_const_3in_slice
+{
+    auto matcher() const
+    {
+        return match::name("slice")(match::nargs(3),
+                                    match::arg(1)(match::is_constant()),
+                                    match::arg(2)(match::is_constant()));
+    }
+
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto slice_ins       = mr.result;
+        auto slice_inputs    = slice_ins->inputs();
+        argument starts_data = slice_inputs.at(1)->eval();
+        argument ends_data   = slice_inputs.at(2)->eval();
+        // leave the instruction untouched if either evaluation came back empty
+        if(starts_data.empty() or ends_data.empty())
+            return;
+
+        // axes come from the operator that is being replaced
+        auto axes = slice_ins->get_operator().to_value().at("axes").to_vector<int64_t>();
+
+        // copy the evaluated index data into int64 vectors for the new operator
+        std::vector<int64_t> starts;
+        std::vector<int64_t> ends;
+        starts_data.visit([&](auto view) { starts.assign(view.begin(), view.end()); });
+        ends_data.visit([&](auto view) { ends.assign(view.begin(), view.end()); });
+
+        m.replace_instruction(
+            slice_ins,
+            make_op("slice", {{"starts", starts}, {"ends", ends}, {"axes", axes}}),
+            slice_inputs.at(0));
+    }
+};
+
+/**
+ * Turn a 4 input slice (data, `input_starts`, `input_ends`, `input_axes`) into the
+ * 1 input version with constant `starts`, `ends` and `axes` when all three index
+ * inputs are constant.
+ */
+struct find_const_4in_slice
+{
+    auto matcher() const
+    {
+        return match::name("slice")(match::nargs(4),
+                                    match::arg(1)(match::is_constant()),
+                                    match::arg(2)(match::is_constant()),
+                                    match::arg(3)(match::is_constant()));
+    }
+
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto slice_ins       = mr.result;
+        auto slice_inputs    = slice_ins->inputs();
+        argument starts_data = slice_inputs.at(1)->eval();
+        argument ends_data   = slice_inputs.at(2)->eval();
+        argument axes_data   = slice_inputs.at(3)->eval();
+        // leave the instruction untouched if any evaluation came back empty
+        if(starts_data.empty() or ends_data.empty() or axes_data.empty())
+            return;
+
+        // copy the evaluated index data into int64 vectors for the new operator
+        std::vector<int64_t> starts;
+        std::vector<int64_t> ends;
+        std::vector<int64_t> axes;
+        starts_data.visit([&](auto view) { starts.assign(view.begin(), view.end()); });
+        ends_data.visit([&](auto view) { ends.assign(view.begin(), view.end()); });
+        axes_data.visit([&](auto view) { axes.assign(view.begin(), view.end()); });
+
+        m.replace_instruction(
+            slice_ins,
+            make_op("slice", {{"starts", starts}, {"ends", ends}, {"axes", axes}}),
+            slice_inputs.at(0));
+    }
+};
+
+// Applies the 2 input static broadcast finder and the 3 and 4 input constant slice
+// finders to the module.
+void simplify_dyn_ops::apply(module& m) const
+{
+    find_static_2in_broadcasts broadcast_finder{};
+    find_const_3in_slice slice3_finder{};
+    find_const_4in_slice slice4_finder{};
+    match::find_matches(m, broadcast_finder, slice3_finder, slice4_finder);
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -122,6 +122,11 @@ struct find_nop_reshapes
        reshapes.insert("pad");
        reshapes.insert("slice");
        reshapes.insert("transpose");
+        reshapes.insert("reduce_mean");
+        reshapes.insert("reduce_max");
+        reshapes.insert("reduce_min");
+        reshapes.insert("reduce_sum");
+        reshapes.insert("reduce_prod");
        return match::name(reshapes)(match::same_shape(match::arg(0)));
    }

@@ -627,6 +632,30 @@ struct find_transpose_contiguous_reshaper_unary
    }
 };

+// Replaces a transpose whose input is a multibroadcast of a scalar with a single
+// multibroadcast that produces the transpose's output lens directly.
+struct find_broadcast_transpose
+{
+    auto matcher() const
+    {
+        auto bcast_arg = match::name("multibroadcast").bind("bcast_ins");
+        return match::name("transpose")(match::arg(0)(bcast_arg));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto transpose_ins = r.result;
+        auto mbcast        = r.instructions["bcast_ins"];
+        auto source        = mbcast->inputs().front();
+        // for now, only a scalar source is rewritten
+        if(not source->get_shape().scalar())
+            return;
+
+        // the new multibroadcast is inserted at the position of the old one
+        auto out_lens    = transpose_ins->get_shape().lens();
+        auto direct_cast = m.insert_instruction(
+            mbcast, make_op("multibroadcast", {{"out_lens", out_lens}}), source);
+        m.replace_instruction(transpose_ins, direct_cast);
+    }
+};
+
 struct find_slice_transpose
 {
    auto matcher() const
@@ -799,6 +828,7 @@ void simplify_reshapes::apply(module& m) const
                            find_nested_slice{},
                            find_nested_concat{},
                            find_transpose_slice{},
+                            find_broadcast_transpose{},
                            find_slice_transpose{},
                            find_transpose_contiguous_reshaper_unary{});
        dead_code_elimination{}.apply(m);

--- a/src/split_single_dyn_dim.cpp
+++ b/src/split_single_dyn_dim.cpp
@@ -68,37 +68,6 @@ has_one_dyn_dim(const std::unordered_map<std::string, shape>& param_shapes)
                                    dds_it->max};
 }

-namespace {
-struct find_static_2in_broadcasts
-{
-    // Convert 2 input static shape broadcast/multibroadcast into 1 input version.
-    // Some compiler passes (ex. simplify_algebra) only support the 1 input versions
-    // of the broadcasting operators.
-    auto matcher() const
-    {
-        return match::broadcast(match::nargs(2),
-                                match::arg(0)(match::static_shape()),
-                                match::arg(1)(match::static_shape()));
-    }
-
-    void apply(module& m, const match::matcher_result& mr) const
-    {
-        auto ins          = mr.result;
-        auto out_lens     = ins->get_shape().lens();
-        auto broadcast_op = ins->get_operator();
-        if(broadcast_op.name() == "broadcast")
-        {
-            broadcast_op.from_value({{"out_lens", out_lens}});
-        }
-        else
-        {
-            broadcast_op.from_value({{"out_lens", out_lens}, {"out_dyn_dims", {}}});
-        }
-        m.replace_instruction(ins, broadcast_op, ins->inputs().at(0));
-    }
-};
-} // namespace
-
 /**
 * Makes all the shapes in the dynamic_dimension range.  Probably won't work for `if`
 * and `loop` instructions, depending on how the submodules for those
@@ -135,7 +104,6 @@ void split_single_dyn_dim::apply(module_pass_manager& mpm) const
                dd_check->dyn_param_str, migraphx::shape{dyn_param_shape.type(), static_lens});
            auto outputs = submod->add_instructions(mm, map_ins);
            submod->add_return({outputs});
-            match::find_matches(*submod, find_static_2in_broadcasts{});
            submodules.push_back(submod);
        }
        // redirect to select_module operator and return

--- a/src/targets/cpu/include/migraphx/cpu/dnnl.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/dnnl.hpp
@@ -91,6 +91,19 @@ struct post_op : reflect_equality<post_op>, reflect_stream<post_op>
    }
 };

+// Adapts a callable that takes only the argument list to the
+// (context&, const std::vector<argument>&) call signature; the context is ignored.
+// Kept as a plain aggregate so it can be brace-initialized from the callable.
+template <class Callable>
+struct execute_wrapper
+{
+    Callable f;
+
+    argument operator()(context&, const std::vector<argument>& args) const
+    {
+        return f(args);
+    }
+};
+
+// Builds an execute_wrapper around `f`, deducing the callable type from the argument.
+template <class F>
+execute_wrapper<F> make_execute_wrapper(F f)
+{
+    return execute_wrapper<F>{std::move(f)};
+}
+
 template <class Derived, class Primitive>
 struct dnnl_op : auto_register_op<Derived>
 {
@@ -308,7 +321,7 @@ struct dnnl_op : auto_register_op<Derived>
 #ifndef NDEBUG
        auto prim_attr = get_primitive_attr(md);
 #endif
-        execute = [=](context&, const std::vector<argument>& args) {
+        execute = make_execute_wrapper([=](const std::vector<argument>& args) {
 #ifndef NDEBUG
            // Check that the memory descriptors have not changed
            auto debug_args = args;
@@ -379,7 +392,7 @@ struct dnnl_op : auto_register_op<Derived>
                m[arg_lookup[i]] = to_dnnl_memory(md.at(arg_lookup[i]), args[i]);
            prim.execute(get_dnnl_context().stream, m);
            return args.back();
-        };
+        });
    }
    std::vector<shape> trim_post_op_inputs(const std::vector<shape>& inputs) const
    {

--- a/src/targets/cpu/include/migraphx/cpu/pointwise.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/pointwise.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP

+#include <array>
 #include <migraphx/config.hpp>
 #include <migraphx/context.hpp>
 #include <migraphx/check_shapes.hpp>

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -23,6 +23,10 @@
 # ####################################################################################

 list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+find_package(hip)
+if(NOT GPU_TARGETS)
+    message(FATAL_ERROR "HIP package is broken and has no GPU_TARGETS, please pass -DGPU_TARGETS=$(/opt/rocm/bin/rocminfo | grep -o -m1 'gfx.*') to cmake to build for your gpu.")
+endif()
 find_package(miopen)

 # rocblas
@@ -44,12 +48,21 @@ else()
    set(MIGRAPHX_USE_HIPRTC ON CACHE BOOL "Use hipRTC APIs")
 endif()

-include(Embed)
 file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
+
+if(WIN32)
+    # TODO: re-enable when CK is ported to Windows
+    list(REMOVE_ITEM KERNEL_FILES
+        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm.hpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck.hpp)
+endif()
+
+include(Embed)
 add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/)

+configure_file(device/targets.hpp.in include/migraphx/gpu/device/targets.hpp)
 file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
 add_library(migraphx_device ${DEVICE_GPU_SRCS})

@@ -69,6 +82,7 @@ rocm_clang_tidy_check(migraphx_device)
 target_link_libraries(migraphx_device PUBLIC migraphx)
 target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
 target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
+# configure_file() above writes the generated targets.hpp under the binary dir's include/.
+target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>)
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
 target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes)
 migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device)
@@ -192,7 +206,7 @@ register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)

-set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
+set(MIGRAPHX_ENABLE_MLIR ON CACHE BOOL "")

 if(MIGRAPHX_ENABLE_MLIR)
    # Find package rocMLIR

--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
@@ -28,6 +28,7 @@
 #include <migraphx/env.hpp>
 #include <cassert>
 #include <iostream>
+#include <deque>

 #ifdef MIGRAPHX_USE_HIPRTC
 #include <hip/hiprtc.h>
@@ -92,7 +93,7 @@ struct hiprtc_program
 {
    struct string_array
    {
-        std::vector<std::string> strings{};
+        std::deque<std::string> strings{};
        std::vector<const char*> c_strs{};

        string_array() {}
@@ -209,7 +210,6 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
        options.push_back("-Wno-gnu-line-marker");
        options.push_back("-Wno-old-style-cast");
    }
-
    if(enabled(MIGRAPHX_GPU_DEBUG{}))
        options.push_back("-DMIGRAPHX_DEBUG");
    if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
@@ -248,7 +248,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
        {
            if(src.path.extension() != ".cpp")
                continue;
-            std::cout << std::string(src.content.first, src.len()) << std::endl;
+            std::cout << std::string(src.content) << std::endl;
        }
    }
    auto p      = dynamic_loader::path(&compile_hip_src_with_hiprtc);
@@ -338,7 +338,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
        {
            if(src.path.extension() != ".cpp")
                continue;
-            std::cout << std::string(src.content.first, src.len()) << std::endl;
+            std::cout << std::string(src.content) << std::endl;
        }
    }

@@ -359,9 +359,7 @@ bool hip_has_flags(const std::vector<std::string>& flags)
        join_strings(flags, " ") + " -x hip -c --offload-arch=gfx900 --cuda-device-only";

    std::string src;
-    src_file input;
-    input.path    = "main.cpp";
-    input.content = std::make_pair(src.data(), src.data() + src.size());
+    src_file input{"main.cpp", src};

    try
    {

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -172,21 +172,17 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
    assert(options.inputs.size() == options.virtual_inputs.size() or
           options.virtual_inputs.empty());
    std::vector<src_file> srcs = options.additional_src_files;
-    std::transform(migraphx_kernels().begin(),
-                   migraphx_kernels().end(),
-                   std::back_inserter(srcs),
-                   [](auto&& p) {
-                       auto&& name = p.first;
-                       auto&& c    = p.second;
-                       auto path   = name;
-                       return src_file{path, c};
-                   });
-    srcs.push_back(src_file{fs::path{"main.cpp"},
-                            std::make_pair(content.data(), content.data() + content.size())});
+    static auto kernels{::migraphx_kernels()};
+    std::transform(
+        kernels.begin(),
+        kernels.end(),
+        std::back_inserter(srcs),
+        [](const std::pair<std::string_view, std::string_view>& elem) { return src_file{elem}; });
+    srcs.emplace_back("main.cpp", content);
    auto args_hpp =
        generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
-    srcs.push_back(src_file{fs::path{"args.hpp"},
-                            std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())});
+    srcs.emplace_back("args.hpp", args_hpp);
+
    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
    options.params += " " + join_strings(compiler_warnings(), " ");

--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -185,8 +185,7 @@ struct compile_plan
            results.begin(), results.end(), std::back_inserter(times), [&](const auto& cr) {
                if(not cr.has_value())
                    return std::numeric_limits<double>::max();
-                return time_op(*ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20)
-                    .first;
+                return time_op(*ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20);
            });
        auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
        std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;

--- a/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
@@ -26,7 +26,9 @@

 #include <hip/hip_runtime.h>
 #include <migraphx/config.hpp>
+#include <migraphx/ranges.hpp>
 #include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/targets.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -79,13 +81,28 @@ inline auto launch(hipStream_t stream, index_int global, index_int local)
        using f_type = decltype(f);
        dim3 nblocks(global / local);
        dim3 nthreads(local);
+        /*
+        hipGetLastError() returns the error recorded by an earlier failed HIP call and resets
+        the stored error code to hipSuccess. MIGraphX calls into various backend libraries, and
+        failed HIP calls can also happen there. Flushing any stale error here ensures that the
+        check below only captures a failure of the hipLaunchKernelGGL() call itself.
+        */
+        hipError_t flush_call = hipGetLastError();
+        (void)(flush_call);
        // cppcheck-suppress UseDeviceLaunch
        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, stream, f);
        hipError_t kernel_launch_status = hipGetLastError();
        if(kernel_launch_status != hipSuccess)
        {
-            MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " +
-                           std::string(hipGetErrorString(kernel_launch_status)));
+            std::string message = hipGetErrorString(kernel_launch_status);
+            if(not contains(get_targets(), get_device_name()))
+            {
+                message += ". Trying to run a kernel for " + get_device_name() +
+                           " but MIGraphX was built for targets " + get_targets_as_string() +
+                           ". Please rebuild MIGraphX with -DGPU_TARGETS='" + get_device_name() +
+                           "'.";
+            }
+            MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " + message);
        }
    };
 }

--- a/src/targets/gpu/device/targets.cpp
+++ b/src/targets/gpu/device/targets.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/gpu/device/targets.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/errors.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+// Splits the MIGRAPHX_GPU_TARGETS string into its individual target names.
+// Entries in that string are separated by ';'.
+static std::vector<std::string> parse_targets()
+{
+    const char list_separator = ';';
+    return split_string(MIGRAPHX_GPU_TARGETS, list_separator);
+}
+
+// Returns the parsed target list. The list is built by parse_targets() the
+// first time this function runs and the same object is returned afterwards.
+const std::vector<std::string>& get_targets()
+{
+    static const std::vector<std::string> targets = parse_targets();
+    return targets;
+}
+
+// Returns every entry of get_targets() joined into one string with ", "
+// between consecutive entries.
+std::string get_targets_as_string()
+{
+    const auto& targets = get_targets();
+    return join_strings(targets, ", ");
+}
+
+// Returns the device id reported by hipGetDevice().
+// Throws (via MIGRAPHX_THROW) when the call does not return hipSuccess; the
+// HIP error string is appended so the underlying cause is visible.
+static int get_device_id()
+{
+    // Initialized so the value is well-defined even if hipGetDevice() fails.
+    int device  = 0;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess)
+        MIGRAPHX_THROW("No device: " + std::string(hipGetErrorString(status)));
+    return device;
+}
+
+// Returns the gcnArchName field of the properties that HIP reports for the
+// device id obtained from get_device_id().
+// Throws (via MIGRAPHX_THROW) when the properties query does not return
+// hipSuccess; the HIP error string is appended to the message.
+// NOTE(review): gcnArchName may carry target-feature suffixes (for example
+// ":xnack-"); callers comparing the result against get_targets() should
+// confirm both use the same format.
+std::string get_device_name()
+{
+    hipDeviceProp_t props{};
+    const int device_id = get_device_id();
+    auto status         = hipGetDeviceProperties(&props, device_id);
+    if(status != hipSuccess)
+        MIGRAPHX_THROW("Failed to get device properties: " +
+                       std::string(hipGetErrorString(status)));
+    return props.gcnArchName;
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/targets.hpp.in
+++ b/src/targets/gpu/device/targets.hpp.in
+/*
+* The MIT License (MIT)
+*
+* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*/
+#ifndef MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
+#define MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
+
+#include <migraphx/gpu/device/config.hpp>
+#include <string>
+#include <vector>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+#define MIGRAPHX_GPU_TARGETS "@GPU_TARGETS@" // NOLINT
+
+// Returns the GPU targets listed in MIGRAPHX_GPU_TARGETS as separate strings.
+// (Comments in this template avoid at-sign and dollar-brace patterns because
+// the file is run through configure-time substitution.)
+MIGRAPHX_DEVICE_EXPORT
+const std::vector<std::string>& get_targets();
+
+// Returns the same targets joined into one string, separated by ", ".
+MIGRAPHX_DEVICE_EXPORT
+std::string get_targets_as_string();
+
+// Returns the architecture name that the HIP runtime reports for the current
+// device.
+MIGRAPHX_DEVICE_EXPORT
+std::string get_device_name();
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
+
+
--- a/src/targets/gpu/driver/compile_op.cpp
+++ b/src/targets/gpu/driver/compile_op.cpp
@@ -38,10 +38,8 @@ struct compile_op : action<compile_op>
        context ctx;
        auto inputs = p.parse_shapes(v.at("inputs"));
        auto op     = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
-        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << host_time << "ms";
-        if(device_time > 0)
-            std::cout << ", " << device_time << "ms";
+        auto t      = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << t << "ms";
        std::cout << std::endl;
    }
 };

--- a/src/targets/gpu/driver/run_op.cpp
+++ b/src/targets/gpu/driver/run_op.cpp
@@ -43,8 +43,8 @@ struct run_op : action<run_op>
        auto op = make_op(name);
        if(v.contains("fields"))
            op.from_value(v.at("fields"));
-        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << host_time << "ms" << std::endl;
+        auto t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << t << "ms" << std::endl;
    }
 };


--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
@@ -283,9 +283,9 @@ struct find_mlir_fused_ops
                       names.end(),
                       ins->inputs().begin(),
                       std::inserter(param_map, param_map.end()),
-                       [&, &anchor_op = anchor_op](auto name, auto input) {
+                       [&, &anchor = anchor_op](auto name, auto input) {
                           if(input == x_ins)
-                               return std::make_pair(pm->get_parameter(name), anchor_op);
+                               return std::make_pair(pm->get_parameter(name), anchor);
                           return std::make_pair(pm->get_parameter(name),
                                                 mm->add_parameter(name, input->get_shape()));
                       });
@@ -302,10 +302,8 @@ struct find_mlir_fused_ops
    }
 };

-struct find_mlir_standalone_convolution_op
+struct find_mlir_standalone_op
 {
-    auto matcher() const { return match::name("convolution"); }
-
    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
    {
        auto conv_based_op = r.result;
@@ -327,6 +325,16 @@ struct find_mlir_standalone_convolution_op
    }
 };

+struct find_mlir_standalone_convolution_op : find_mlir_standalone_op
+{
+    auto matcher() const { return is_mlir_conv; }
+};
+
+struct find_mlir_standalone_dot_op : find_mlir_standalone_op
+{
+    auto matcher() const { return match::any_of(match::name("dot"), match::name("quant_dot")); }
+};
+
 /**
 * @brief Declares a new MIGraphX environment variable which forces to generate
 * only specific MLIR operations.
@@ -334,7 +342,7 @@ struct find_mlir_standalone_convolution_op
 * The variable, if defined, forces MIGraphX to use only specific operations
 * with MLIR regardless of the underlying GPU architecture. The variable accepts
 * a list of operations separated by comma. The variable recognizes the following
- * operations: "fused", "convolution". If the variable is not defined MIGraphX
+ * operations: "fused", "convolution", "dot". If the variable is not defined MIGraphX
 * will decide by itself which operations to delegate to MLIR. The variable is
 * intended to be primarily used by rocMLIR developers.
 */
@@ -349,31 +357,33 @@ bool is_requested(std::string_view option)
    return contains(options, option);
 }

-bool is_fusion_enabled()
-{
-    if(is_self_decide())
-    {
-        return true;
-    }
-    return is_requested("fused");
-}
-
-bool is_standalone_convs_enabled(context* ctx)
+bool is_enabled(std::string_view op_name, context* ctx)
 {
    if(is_self_decide())
    {
-        if(ctx == nullptr)
+        if(op_name == "fused")
        {
-            return false;
+            return true;
+        }
+        else if(op_name == "convolution" or op_name == "quant_convolution")
+        {
+            if(ctx == nullptr)
+            {
+                return false;
+            }
+            else
+            {
+                const auto& device = ctx->get_current_device();
+                const std::string navi_family{"gfx110"};
+                return starts_with(device.get_gfx_name(), navi_family);
+            }
        }
        else
        {
-            const auto& device = ctx->get_current_device();
-            const std::string navi_family{"gfx110"};
-            return starts_with(device.get_gfx_name(), navi_family);
+            return false;
        }
    }
-    return is_requested("convolution");
+    return is_requested(op_name);
 }
 } // namespace

@@ -382,21 +392,25 @@ bool is_standalone_convs_enabled(context* ctx)
 void fuse_mlir::apply(module_pass_manager& mpm) const
 {
 #ifdef MIGRAPHX_MLIR
-    if(is_fusion_enabled())
+    if(is_enabled("fused", this->ctx))
    {
        match::find_matches(mpm, find_mlir_fused_ops{});
    }

-    if(is_standalone_convs_enabled(this->ctx))
+    if(is_enabled("convolution", this->ctx))
    {
        match::find_matches(mpm, find_mlir_standalone_convolution_op{});
    }
+
+    if(is_enabled("dot", this->ctx))
+    {
+        match::find_matches(mpm, find_mlir_standalone_dot_op{});
+    }
 #else
    (void)mpm;
 #endif
 }

 } // namespace gpu
-
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -790,22 +790,26 @@ struct find_layernorm_pointwise
 {
    auto matcher() const
    {
-        return precompile_name("pointwise")(match::arg(0)(
+        return precompile_name("pointwise")(match::any_of[match::inputs()](
            precompile_name("gpu::prelayernorm", "gpu::preadd_layernorm").bind("layernorm")));
    }

    void apply(module& m, const match::matcher_result& r) const
    {
-        auto ins       = r.result;
+        auto pw_ins    = r.result;
        auto layernorm = r.instructions["layernorm"];
        if(not layernorm->module_inputs().empty())
            return;
-        auto* pm    = ins->module_inputs().front();
+        auto* pm       = pw_ins->module_inputs().front();
+        auto pw_inputs = pw_ins->inputs();
+        auto ln_pos    = std::find(pw_inputs.begin(), pw_inputs.end(), layernorm);
+        assert(ln_pos != pw_inputs.end());
+        pw_inputs.erase(ln_pos);
        auto inputs = layernorm->inputs();
        inputs.pop_back();
-        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
+        inputs.insert(inputs.end(), pw_inputs.begin(), pw_inputs.end());

-        m.replace_instruction(ins, layernorm->get_operator(), inputs, {pm});
+        m.replace_instruction(pw_ins, layernorm->get_operator(), inputs, {pm});
    }
 };