Commit cd4ab535 authored by Khalique Ahmed

manual merge

parents 3891ee58 a0fa3742
......@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
......
......@@ -44,7 +44,7 @@ struct auto_register_action
template <class T>
static void apply()
{
auto name = get_type_name<T>();
const auto& name = get_type_name<T>();
register_action(name.substr(name.rfind("::") + 2),
[](auto&&... xs) { T::apply(std::forward<decltype(xs)>(xs)...); });
}
......
......@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/time_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/make_op.hpp>
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/fuse_ck.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct ck_gemm
{
operation op = make_op("dot");
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::ck_gemm"; }
void check_gemm_shape(const shape& s) const
{
if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1))
MIGRAPHX_THROW("Invalid shape for ck_gemm");
}
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
check_shapes{inputs, *this}.same_ndims();
if(inputs.size() < 2)
MIGRAPHX_THROW("should have at least two inputs.");
auto a = inputs[0];
auto b = inputs[1];
for(const auto& input : inputs)
check_gemm_shape(input);
auto r = op.compute_shape({a, b});
if(mods.empty())
return r;
return r.with_type(mods.front()->get_output_shapes().front().type());
}
};
MIGRAPHX_REGISTER_OP(ck_gemm);
namespace {
bool is_ck_supported_type(shape::type_t t)
{
return contains({shape::half_type, shape::int8_type, shape::int32_type}, t);
}
MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
{
if(ins->name() != "dot" and ins->name() != "quant_dot")
return false;
if(not is_ck_supported_type(ins->get_shape().type()))
return false;
auto a = ins->inputs().front()->get_shape();
auto b = ins->inputs().back()->get_shape();
// Skipping GEMMs with a K dimension greater than 2048 is a coarse-grained strategy
// to avoid poor-performing GEMM kernels from CK
// To-do: Investigate a more precise strategy
return a.lens().back() <= 2048;
}
struct find_ck_gemm_pointwise
{
// Find a gemm followed by a pointwise operation.
auto matcher() const
{
auto gemm = match::skip(match::name("contiguous"))(
match::name("dot", "quant_dot")(is_ck_gemm().bind("gemm")));
return match::name("pointwise")(match::any_of[match::inputs()](gemm.bind("x")));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
auto x_ins = r.instructions["x"]; // input after contiguous
auto* pm = ins->module_inputs().front();
auto names = pm->get_parameter_names();
std::sort(names.begin(), names.end());
auto inputs = ins->inputs();
auto gemm_it = std::find(inputs.begin(), inputs.end(), x_ins);
auto gemm_idx = gemm_it - inputs.begin();
if(gemm_ins->get_shape().type() != shape::int32_type and
ins->get_shape().type() != gemm_ins->get_shape().type())
return;
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
return not is_ck_supported_type(input->get_shape().type());
}))
return;
assert(gemm_it != inputs.end());
if(gemm_idx != 0)
{
auto first_param = pm->get_parameter(names[0]);
auto gemm_param = pm->get_parameter(names[gemm_idx]);
auto new_gemm_param = pm->add_parameter(names[0] + "_0", gemm_param->get_shape());
auto new_first_param =
pm->add_parameter(names[gemm_idx] + "_0", first_param->get_shape());
pm->replace_instruction(gemm_param, new_gemm_param);
pm->replace_instruction(first_param, new_first_param);
pm->remove_instruction(first_param);
pm->remove_instruction(gemm_param);
}
inputs.erase(gemm_it);
inputs.insert(inputs.begin(), gemm_ins->inputs().begin(), gemm_ins->inputs().end());
mpm.get_module().replace_instruction(ins, ck_gemm{gemm_ins->get_operator()}, inputs, {pm});
}
};
struct find_ck_gemm
{
auto matcher() const { return match::name("dot")(is_ck_gemm().bind("gemm")); }
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
mpm.get_module().replace_instruction(ins, ck_gemm{ins->get_operator()}, ins->inputs());
}
};
} // namespace
void fuse_ck::apply(module_pass_manager& mpm) const
{
match::find_matches(mpm, find_ck_gemm_pointwise{});
match::find_matches(mpm, find_ck_gemm{});
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
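
The new gpu::fuse_ck pass above is a regular module pass: find_ck_gemm_pointwise folds a pointwise module into ck_gemm, and find_ck_gemm catches the remaining bare dots. A minimal sketch of scheduling it, assuming the run_passes helper from pass_manager.hpp and an already initialized GPU context; nothing here beyond the pass itself is taken from this commit.

#include <migraphx/gpu/fuse_ck.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/program.hpp>

// Hypothetical pipeline fragment: run CK fusion after pointwise modules
// have been formed so find_ck_gemm_pointwise can see them.
void run_ck_fusion(migraphx::program& p, migraphx::gpu::context& ctx)
{
    migraphx::run_passes(p, {migraphx::gpu::fuse_ck{&ctx}});
}
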
......@@ -38,9 +38,32 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR);
bool mlir_enabled()
{
#ifdef MIGRAPHX_MLIR
struct mlir_conv
const bool mlir_enabled = enabled(MIGRAPHX_ENABLE_MLIR{});
if(mlir_enabled)
{
return true;
}
else
{
std::cerr << "WARNING: MIGraphX built with MLIR but it is not enabled. Please set the env "
"var MIGRAPHX_ENABLE_MLIR to use MLIR kernel generator."
<< std::endl;
return false;
}
#else
return false;
#endif
}
#ifdef MIGRAPHX_MLIR
struct mlir_op
{
std::string name() const { return "gpu::mlir_op"; }
operation op = make_op("convolution");
template <class Self, class F>
......@@ -49,7 +72,6 @@ struct mlir_conv
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::mlir_conv"; }
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
check_shapes{inputs, *this}.packed_or_broadcasted();
......@@ -57,17 +79,50 @@ struct mlir_conv
MIGRAPHX_THROW("should have one submodule.");
if(inputs.size() < 2)
MIGRAPHX_THROW("should have at least two inputs.");
auto n = inputs.size();
return op.compute_shape({inputs[n - 2], inputs[n - 1]});
module_ref mod = mods[0];
auto type = mod->get_output_shapes().front().type();
std::unordered_map<instruction_ref, shape> ins_shapes;
size_t param_cnt = 0;
std::vector<std::string> names = mod->get_parameter_names();
std::sort(names.begin(), names.end());
for(std::string param_name : names)
{
ins_shapes[mod->get_parameter(param_name)] = inputs[param_cnt++];
}
for(auto ins : iterator_for(*mod))
{
if(ins->name() == "@param")
{
continue;
}
if(ins->name() == "@literal")
{
ins_shapes[ins] = ins->get_shape();
continue;
}
if(ins->name() == "@return")
{
return ins_shapes[ins->inputs().at(0)].with_type(type);
}
std::vector<shape> input_shapes;
input_shapes.resize(ins->inputs().size());
std::transform(ins->inputs().begin(),
ins->inputs().end(),
input_shapes.begin(),
[&](auto in) { return ins_shapes[in]; });
ins_shapes[ins] = ins->get_operator().compute_shape(input_shapes);
}
MIGRAPHX_THROW("No return found in the submodule");
}
};
MIGRAPHX_REGISTER_OP(mlir_conv);
MIGRAPHX_REGISTER_OP(mlir_op);
namespace {
MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
{
if(ins->name() != "convolution")
if(ins->name() != "convolution" and ins->name() != "quant_convolution")
return false;
value v = ins->get_operator().to_value();
auto group = v.at("group").to<int>();
......@@ -79,51 +134,107 @@ MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
return true;
}
struct find_conv_pointwise
struct find_mlir_op
{
// Find a convolution followed by a pointwise operation.
auto matcher() const
{
auto convolution =
match::skip(match::name("contiguous"))(is_mlir_conv().bind("convolution"));
return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
auto dot_or_conv = match::skip(match::name("contiguous"))(
match::any_of(match::name("dot"), is_mlir_conv()).bind("gemm_based_op"));
return match::name("pointwise")(match::any_of[match::inputs()](dot_or_conv.bind("x")));
}
std::unordered_map<instruction_ref, instruction_ref>
create_param_map_with_literals(module_ref mm, const module* pm, const shape& shape) const
{
std::unordered_map<instruction_ref, instruction_ref> ins_map;
for(auto ins : iterator_for(*pm))
{
if(ins->name() != "@literal")
{
continue;
}
literal r = ins->get_literal();
instruction_ref literal = mm->add_literal(r);
instruction_ref mbcast = mm->add_instruction(
make_op("multibroadcast", {{"out_lens", shape.lens()}}), literal);
ins_map[ins] = mbcast;
}
return ins_map;
}
std::tuple<instruction_ref, std::vector<instruction_ref>>
fuse_input_ops_and_gemm_based_op(module_ref mm, instruction_ref gemm_based_op) const
{
std::vector<instruction_ref> top_inputs;
std::vector<instruction_ref> imm_inputs;
size_t input_cnt = 0;
for(instruction_ref input : gemm_based_op->inputs())
{
std::vector<operation> op_stream;
while(contains({"slice", "transpose", "contiguous", "reshape"}, input->name()))
{
op_stream.push_back(input->get_operator());
input = input->inputs().at(0);
}
top_inputs.push_back(input);
instruction_ref prev_input =
mm->add_parameter("y" + std::to_string(input_cnt++), input->get_shape());
for(const auto& op : reverse(op_stream))
{
prev_input = mm->add_instruction(op, {prev_input});
}
imm_inputs.push_back(prev_input);
}
instruction_ref new_gemm_based_op =
mm->add_instruction(gemm_based_op->get_operator(), imm_inputs);
return {new_gemm_based_op, top_inputs};
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
auto conv_ins = r.instructions["convolution"];
auto x_ins = r.instructions["x"]; // input after contiguous
auto* pm = ins->module_inputs().front();
auto names = pm->get_parameter_names();
auto ins = r.result;
auto gemm_based_op = r.instructions["gemm_based_op"];
auto x_ins = r.instructions["x"]; // input after contiguous
auto* pm = ins->module_inputs().front();
auto names = pm->get_parameter_names();
// Whitelist pointwise operators
if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
return not contains({"@literal", "@param", "@return", "convolution", "add", "relu"},
return not contains({"@literal",
"@param",
"@return",
"convolution",
"quant_convolution",
"dot",
"add",
"relu",
"dequantizelinear",
"quantizelinear",
"mul"},
i.name());
}))
return;
// Only fuse with fp32/fp16
// Only fuse with fp32/fp16/int8/int32
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
return not contains({shape::type_t::float_type, shape::type_t::half_type},
return not contains({shape::type_t::float_type,
shape::type_t::half_type,
shape::type_t::int8_type,
shape::type_t::int32_type},
i->get_shape().type());
}))
return;
std::sort(names.begin(), names.end());
module_ref mm = mpm.create_module("mlir_" + pm->name());
mm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> param_map;
auto x = mm->add_parameter("x" + std::to_string(names.size()),
conv_ins->inputs().at(0)->get_shape());
auto w = mm->add_parameter("x" + std::to_string(names.size() + 1),
conv_ins->inputs().at(1)->get_shape());
auto conv = mm->add_instruction(conv_ins->get_operator(), {x, w});
std::unordered_map<instruction_ref, instruction_ref> param_map =
create_param_map_with_literals(mm, pm, gemm_based_op->get_shape());
auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(mm, gemm_based_op);
std::transform(names.begin(),
names.end(),
ins->inputs().begin(),
std::inserter(param_map, param_map.end()),
[&](auto name, auto input) {
[&, &anchor_op = anchor_op](auto name, auto input) {
if(input == x_ins)
return std::make_pair(pm->get_parameter(name), conv);
return std::make_pair(pm->get_parameter(name), anchor_op);
return std::make_pair(pm->get_parameter(name),
mm->add_parameter(name, input->get_shape()));
});
......@@ -133,12 +244,13 @@ struct find_conv_pointwise
std::copy_if(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(inputs),
[&](auto input) { return input != conv_ins; });
inputs.insert(inputs.end(), conv_ins->inputs().begin(), conv_ins->inputs().end());
[&](auto input) { return input != gemm_based_op; });
inputs.insert(inputs.end(), top_inputs.begin(), top_inputs.end());
mpm.get_module().replace_instruction(
ins, mlir_conv{conv_ins->get_operator()}, inputs, {mm});
ins, mlir_op{gemm_based_op->get_operator()}, inputs, {mm});
}
};
} // namespace
#endif
......@@ -146,17 +258,7 @@ struct find_conv_pointwise
void fuse_mlir::apply(module_pass_manager& mpm) const
{
#ifdef MIGRAPHX_MLIR
const bool mlir_enabled = enabled(MIGRAPHX_ENABLE_MLIR{});
if(mlir_enabled)
{
match::find_matches(mpm, find_conv_pointwise{});
}
else
{
std::cerr << "WARNING: MIGraphX built with MLIR but it is not enabled. Please set the env "
"var MIGRAPHX_ENABLE_MLIR to use MLIR kernel generator."
<< std::endl;
}
match::find_matches(mpm, find_mlir_op{});
#else
(void)mpm;
#endif
......
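
The mlir_enabled() check above now lives outside the pass, so the decision to schedule fuse_mlir can be made once when the pass list is built. A rough, hedged sketch of that gating; the surrounding pipeline function is illustrative, not part of this diff.

#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/fuse_mlir.hpp>
#include <migraphx/pass.hpp>
#include <vector>

// Hypothetical fragment of a target pipeline: schedule the MLIR fusion pass
// only when MIGraphX was built with MLIR and MIGRAPHX_ENABLE_MLIR is set.
std::vector<migraphx::pass> mlir_passes(migraphx::gpu::context& ctx)
{
    std::vector<migraphx::pass> passes;
    if(migraphx::gpu::mlir_enabled())
        passes.push_back(migraphx::gpu::fuse_mlir{&ctx});
    return passes;
}
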
......@@ -165,7 +165,8 @@ struct fusion
const std::unordered_set<std::string>& get_supported_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx900", "gfx906", "gfx908", "gfx1030"};
static std::unordered_set<std::string> supported_archs{
"gfx900", "gfx906", "gfx908", "gfx1030", "gfx940"};
return supported_archs;
}
......
......@@ -140,13 +140,8 @@ void gemm_impl(context& ctx,
compute_type = rocblas_datatype_f32_r;
}
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags flag =
int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
(void)int8_x4_format;
int flag = 0;
#endif
auto a_lens = args[0].get_shape().lens();
auto b_lens = args[1].get_shape().lens();
......
......@@ -146,7 +146,11 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
gpu_sync();
std::vector<T> result(sz);
assert(not is_device_ptr(result.data()));
assert(is_device_ptr(x));
if(not is_device_ptr(x))
{
MIGRAPHX_THROW(
"read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
}
auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
if(status != hipSuccess)
MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT
......@@ -189,8 +193,20 @@ argument register_on_gpu(const argument& arg)
argument to_gpu(const argument& arg, bool host)
{
auto p = write_to_gpu(arg.data(), arg.get_shape().bytes(), host);
return {arg.get_shape(), p};
argument result;
arg.visit(
[&](auto x) {
auto p = write_to_gpu(arg.data(), arg.get_shape().bytes(), host);
result = {x.get_shape(), p};
},
[&](const auto& xs) {
std::vector<argument> args;
std::transform(xs.begin(), xs.end(), std::back_inserter(args), [&](auto x) {
return to_gpu(x, host);
});
result = argument{args};
});
return result;
}
argument from_gpu(const argument& arg)
......
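
to_gpu above now visits tuple arguments and copies each element separately, so nested arguments survive the trip to the device. A small hedged round-trip sketch; generate_argument and the shapes are just illustrative inputs.

#include <migraphx/argument.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/gpu/hip.hpp>

// Hypothetical round trip: a tuple argument is copied to the GPU element by
// element by the new to_gpu and brought back with from_gpu.
migraphx::argument roundtrip_tuple()
{
    auto a   = migraphx::generate_argument({migraphx::shape::float_type, {4, 4}});
    auto b   = migraphx::generate_argument({migraphx::shape::half_type, {8}});
    auto tup = migraphx::argument({a, b});  // tuple of two sub-arguments
    auto dev = migraphx::gpu::to_gpu(tup);  // recurses per element
    return migraphx::gpu::from_gpu(dev);    // back on the host
}
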
......@@ -21,71 +21,13 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
project(migraphx-doc)
find_package(ROCM REQUIRED)
include(ROCMDoxygenDoc)
set(DOXYGEN_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/doxygen)
rocm_add_doxygen_doc(
OUTPUT_DIRECTORY ${DOXYGEN_OUTPUT}
INPUT
${CMAKE_SOURCE_DIR}/src
INCLUDE_PATH
${CMAKE_SOURCE_DIR}/src/include
${CMAKE_SOURCE_DIR}/src/targets/cpu/include
${CMAKE_SOURCE_DIR}/src/targets/gpu/include
STRIP_FROM_INC_PATH
${CMAKE_SOURCE_DIR}/src/include
${CMAKE_SOURCE_DIR}/src/targets/cpu/include
${CMAKE_SOURCE_DIR}/src/targets/gpu/include
EXCLUDE_PATTERNS
${CMAKE_SOURCE_DIR}/src/targets/gpu/kernels
${CMAKE_SOURCE_DIR}/src/targets/gpu/device
SEARCH_INCLUDES YES
MACRO_EXPANSION YES
RECURSIVE YES
GENERATE_XML YES
GENERATE_LATEX YES
USE_PDFLATEX YES
CALL_GRAPH YES
CALLER_GRAPH YES
BUILTIN_STL_SUPPORT YES
PROJECT_NAME MIGraphX
SORT_MEMBERS_CTORS_1ST YES
SOURCE_BROWSER YES
GENERATE_TREEVIEW YES
REFERENCED_BY_RELATION YES
REFERENCES_RELATION YES
REFERENCES_LINK_SOURCE YES
EXTRACT_ALL YES
ENUM_VALUES_PER_LINE 1
FULL_PATH_NAMES YES
WARN_LOGFILE "${DOXYGEN_OUTPUT}/DoxygenWarningLog.txt"
PREDEFINED DOXYGEN
add_executable(migraphx-hiprtc-driver
main.cpp
)
include(ROCMSphinxDoc)
rocm_add_sphinx_doc(src
BUILDER html
OUTPUT_DIR html
VARS
breathe_projects.proj=${DOXYGEN_OUTPUT}/xml
breathe_default_project=proj
DEPENDS doxygen
rocm_clang_tidy_check(migraphx-hiprtc-driver)
target_link_libraries(migraphx-hiprtc-driver PRIVATE migraphx_gpu)
add_dependencies(migraphx_all_targets migraphx-hiprtc-driver)
rocm_install_targets(
TARGETS migraphx-hiprtc-driver
)
find_package(LATEX)
if(LATEX_FOUND)
rocm_add_sphinx_doc(src
BUILDER latex
OUTPUT_DIR pdf
VARS
breathe_projects.proj=${DOXYGEN_OUTPUT}/xml
breathe_default_project=proj
DEPENDS doxygen
)
else()
message("Latex builder not found. Latex builder is required only for building the PDF documentation for MIGraphX and is not necessary for building the library, or any other components. To build PDF documentation run make in ${CMAKE_CURRENT_SOURCE_DIR}/pdf, once a latex builder is installed.")
endif()
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/serialize.hpp>
#include <migraphx/value.hpp>
#include <migraphx/msgpack.hpp>
#include <migraphx/file_buffer.hpp>
#include <migraphx/ranges.hpp>
#include <iostream>
#include <cstring>
std::vector<char> read_stdin()
{
std::vector<char> result;
std::array<char, 1024> buffer;
std::size_t len = 0;
while((len = std::fread(buffer.data(), 1, buffer.size(), stdin)) > 0)
{
if(std::ferror(stdin) != 0 and std::feof(stdin) == 0)
MIGRAPHX_THROW(std::strerror(errno));
result.insert(result.end(), buffer.data(), buffer.data() + len);
}
return result;
}
int main(int argc, char const* argv[])
{
if(argc < 2 or migraphx::contains({"-h", "--help", "-v", "--version"}, std::string(argv[1])))
{
std::cout << "USAGE:" << std::endl;
std::cout << " ";
std::cout << "Used internally by migraphx to compile hip programs out-of-process."
<< std::endl;
std::exit(0);
}
std::string output_name = argv[1];
auto v = migraphx::from_msgpack(read_stdin());
std::vector<migraphx::gpu::hiprtc_src_file> srcs;
migraphx::from_value(v.at("srcs"), srcs);
auto out = migraphx::gpu::compile_hip_src_with_hiprtc(
std::move(srcs), v.at("params").to<std::string>(), v.at("arch").to<std::string>());
if(not out.empty())
migraphx::write_buffer(output_name, out.front());
}
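
The migraphx-hiprtc-driver above reads a msgpack-encoded value from stdin with srcs, params, and arch entries and writes the first compiled code object to the path given as argv[1]. A hedged sketch of what the sending side might serialize; the flags and architecture string are placeholders, not values from this commit.

#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/msgpack.hpp>
#include <migraphx/serialize.hpp>
#include <migraphx/value.hpp>
#include <string>
#include <vector>

// Hypothetical: build the stdin payload the out-of-process driver expects.
std::vector<char> make_driver_payload(const std::vector<migraphx::gpu::hiprtc_src_file>& srcs)
{
    migraphx::value v;
    v["srcs"]   = migraphx::to_value(srcs);
    v["params"] = std::string{"-O3"};    // compiler flags: placeholder
    v["arch"]   = std::string{"gfx908"}; // target architecture: placeholder
    return migraphx::to_msgpack(v);
}
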
......@@ -26,6 +26,7 @@
#include <migraphx/config.hpp>
#include <migraphx/module_ref.hpp>
#include <migraphx/instruction_ref.hpp>
#include <string>
#include <unordered_map>
#include <vector>
......@@ -34,6 +35,7 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct shape;
struct operation;
namespace gpu {
......@@ -72,8 +74,23 @@ std::string make_transformer_args(Ts... xs)
std::string generate_pointwise(const module& pm, const std::string& name);
std::string generate_reduce(const module& m, const std::string& name);
std::string generate_name_from_ops(const module& m);
struct reduce_op
{
std::string input = "";
std::string reduction = "";
std::string init = "0";
std::string read = "op::id{}";
std::string write = "op::id{}";
void set(instruction_ref ins, const operation& op);
std::string str() const;
static std::string generate(instruction_ref ins, const std::string& x);
};
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -27,6 +27,8 @@
#include <migraphx/config.hpp>
#include <migraphx/filesystem.hpp>
#include <migraphx/compile_src.hpp>
#include <migraphx/env.hpp>
#include <migraphx/functional.hpp>
#include <string>
#include <utility>
#include <vector>
......@@ -35,6 +37,31 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
#ifdef MIGRAPHX_USE_HIPRTC
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);
#endif
struct hiprtc_src_file
{
hiprtc_src_file() = default;
hiprtc_src_file(const src_file& s)
: path(s.path.string()), content(s.content.first, s.content.second)
{
}
std::string path;
std::string content;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.path, "path"), f(self.content, "content"));
}
};
std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file> srcs,
std::string params,
const std::string& arch);
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
......
......@@ -26,6 +26,7 @@
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/compile_src.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -39,9 +40,10 @@ struct hip_compile_options
std::size_t local;
std::vector<shape> inputs;
shape output;
std::string kernel_name = "kernel";
std::string params = "";
std::vector<shape> virtual_inputs = {};
std::string kernel_name = "kernel";
std::string params = "";
std::vector<shape> virtual_inputs = {};
std::vector<src_file> additional_src_files = {};
/**
* @brief Set the launch parameters but allow v to override the values
......@@ -71,6 +73,8 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
std::string generate_make_shape(const shape& s);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
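
hip_compile_options above gains additional_src_files next to the existing launch and shape fields. A rough sketch of filling the options and handing a kernel source string to compile_hip_code_object; the header path, launch sizes, and kernel body are assumptions for illustration only.

#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <string>

// Hypothetical: compile a kernel string into a code-object operation.
migraphx::operation compile_example(const migraphx::shape& in, const migraphx::shape& out)
{
    migraphx::gpu::hip_compile_options options;
    options.global      = 1024;             // total work-items: placeholder
    options.local       = 256;              // work-items per group: placeholder
    options.inputs      = {in};
    options.output      = out;
    options.kernel_name = "example_kernel";
    std::string src     = "/* kernel source elided */";
    return migraphx::gpu::compile_hip_code_object(src, options);
}
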
......@@ -38,7 +38,8 @@ struct context;
struct compile_ops
{
context* ctx = nullptr;
context* ctx = nullptr;
bool exhaustive_tune = false;
std::string name() const { return "gpu::compile_ops"; }
void apply(module& m) const;
};
......
......@@ -30,6 +30,8 @@
#include <migraphx/value.hpp>
#include <migraphx/module.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/optional.hpp>
#include <migraphx/rank.hpp>
#include <functional>
namespace migraphx {
......@@ -38,17 +40,58 @@ namespace gpu {
struct context;
using compiler_replace = std::function<void(module& m, instruction_ref ins)>;
using compiler_compile = std::function<compiler_replace(context&, instruction_ref, operation)>;
struct compiler_replace
{
compiler_replace() = default;
compiler_replace(const operation& op) : code_object{op} {}
template <class F>
compiler_replace(const operation& op, F f)
: code_object{op},
replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) {
f(m, ins, cr.code_object);
})
{
}
operation code_object = {};
std::function<void(const compiler_replace& cr, module& m, instruction_ref ins)> replace_fn =
nullptr;
void replace(module& m, instruction_ref ins) const
{
if(replace_fn)
replace_fn(*this, m, ins);
else
m.replace_instruction(ins, code_object, ins->inputs());
}
};
struct tuning_config
{
value problem;
std::vector<value> solutions;
};
using compiler_compile =
std::function<compiler_replace(context&, instruction_ref, operation, const value&)>;
using compiler_compile_op =
std::function<operation(context&, const std::vector<shape>& inputs, const value&)>;
using compiler_tuning_config =
std::function<optional<tuning_config>(context&, instruction_ref, const operation&)>;
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop);
void register_compiler(const std::string& name,
compiler_compile c,
compiler_compile_op cop,
compiler_tuning_config ctg);
bool has_compiler_for(const std::string& name);
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op);
compiler_replace
compile(context& ctx, instruction_ref ins, const operation& op, const value& solution);
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
optional<tuning_config> get_tuning_config(context& ctx, instruction_ref ins, const operation& op);
template <class T>
void register_compiler()
......@@ -58,8 +101,11 @@ void register_compiler()
{
register_compiler(
name,
[=](auto&&... xs) { return c.compile(std::forward<decltype(xs)>(xs)...); },
[=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); });
[=](auto&&... xs) {
return c.invoke_compile(rank<1>{}, std::forward<decltype(xs)>(xs)...);
},
[=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); },
[=](auto&&... xs) { return c.get_tuning_config(std::forward<decltype(xs)>(xs)...); });
}
}
......@@ -78,12 +124,30 @@ using auto_register_compiler = auto_register<register_compiler_action, T>;
template <class Derived>
struct compiler : auto_register_compiler<Derived>
{
auto replace(const operation& op) const
const Derived& derived() const { return static_cast<const Derived&>(*this); }
optional<tuning_config> get_tuning_config(context&, instruction_ref, const operation&) const
{
return
[=](module& m, instruction_ref ins) { m.replace_instruction(ins, op, ins->inputs()); };
return nullopt;
}
operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
template <class D = Derived>
auto invoke_compile(
rank<1>, context& ctx, instruction_ref ins, operation op, const value& solution) const
-> decltype(std::declval<D>().compile(ctx, ins, std::move(op), solution))
{
return derived().compile(ctx, ins, std::move(op), solution);
}
template <class D = Derived>
auto invoke_compile(
rank<0>, context& ctx, instruction_ref ins, operation op, const value& solution) const
-> decltype(std::declval<D>().compile(ctx, ins, std::move(op)))
{
assert(solution.empty());
(void)solution;
return derived().compile(ctx, ins, std::move(op));
}
};
} // namespace gpu
......
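
compiler_replace above now bundles the compiled code object with an optional replacement hook, and compilers can report tuning configs. A hedged sketch of a minimal compiler against this interface; the operation name, the names() hook, and the to_shapes helper follow how existing MIGraphX compilers look and are assumptions, not part of this diff.

#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/module.hpp>
#include <string>
#include <vector>

// Hypothetical compiler built on the updated interface: compile() returns a
// compiler_replace whose custom hook decides how the instruction is rewritten.
struct example_compiler : migraphx::gpu::compiler<example_compiler>
{
    std::vector<std::string> names() const { return {"example_op"}; }

    migraphx::operation compile_op(migraphx::gpu::context&,
                                   const std::vector<migraphx::shape>&,
                                   const migraphx::value&) const
    {
        return migraphx::make_op("identity"); // stands in for a real code object
    }

    migraphx::gpu::compiler_replace
    compile(migraphx::gpu::context& ctx, migraphx::instruction_ref ins, migraphx::operation op) const
    {
        auto co = compile_op(ctx, migraphx::to_shapes(ins->inputs()), op.to_value());
        // The hook receives the code object back through compiler_replace.
        return {co,
                [](migraphx::module& m, migraphx::instruction_ref i, const migraphx::operation& obj) {
                    m.replace_instruction(i, obj, i->inputs());
                }};
    }
};

Because get_tuning_config is not overridden, the base class default returns nullopt and the three-argument compile() is selected through the rank<0> overload of invoke_compile.
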
......@@ -170,7 +170,9 @@ struct hip_device
std::size_t stream_id() const { return current_stream; }
std::string get_device_name() const { return device_props.gcnArchName; }
std::string get_device_name() const { return get_arch_name(device_props); }
std::string get_gfx_name() const { return trim(split_string(get_device_name(), ':').front()); }
std::size_t get_device_major() const { return device_props.major; }
......
......@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
#define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP
#include <migraphx/shape.hpp>
#include <migraphx/generate.hpp>
......
......@@ -27,10 +27,14 @@
#include <migraphx/config.hpp>
#include <string>
struct hipDeviceProp_t;
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
std::string get_arch_name(const hipDeviceProp_t& props);
std::string get_device_name();
int get_device_id();
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
#define MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module_pass_manager;
namespace gpu {
struct fuse_ck
{
context* ctx = nullptr;
std::string name() const { return "gpu::fuse_ck"; }
void apply(module_pass_manager& mpm) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
......@@ -34,6 +34,8 @@ struct module_pass_manager;
namespace gpu {
bool mlir_enabled();
struct fuse_mlir
{
context* ctx = nullptr;
......