Unverified Commit 23cb7917 authored by Brian Pickrell's avatar Brian Pickrell Committed by GitHub
Browse files

Merge branch 'develop' into blas_tuning

parents b5fcc0bc ea32ca70
...@@ -48,6 +48,7 @@ struct sqlite_impl ...@@ -48,6 +48,7 @@ struct sqlite_impl
template <class F> template <class F>
void exec(const char* sql, F f) void exec(const char* sql, F f)
{ {
// cppcheck-suppress constParameterPointer
auto callback = [](void* obj, auto... xs) -> int { auto callback = [](void* obj, auto... xs) -> int {
try try
{ {
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/target.hpp>
#include <migraphx/register_target.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
// Serialize a target into a value by recording only its registered name;
// the rest of the target's state is reconstructible from the registry.
void migraphx_to_value(value& v, const target& t) { v["name"] = t.name(); }
// Deserialize a target from a value: read the stored "name" key and ask the
// target registry for the matching instance. Throws (via value::at) if the
// "name" key is absent.
void migraphx_from_value(const value& v, target& t)
{
t = make_target(v.at("name").to<std::string>());
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -78,6 +78,8 @@ else() ...@@ -78,6 +78,8 @@ else()
endif() endif()
target_link_libraries(migraphx_cpu PRIVATE migraphx) target_link_libraries(migraphx_cpu PRIVATE migraphx)
migraphx_generate_export_header(migraphx_cpu)
find_package(OpenMP) find_package(OpenMP)
target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
# Add library path to rpath to workaround issues with our broken packages # Add library path to rpath to workaround issues with our broken packages
...@@ -88,8 +90,6 @@ foreach(LIBRARY ${OpenMP_CXX_LIBRARIES}) ...@@ -88,8 +90,6 @@ foreach(LIBRARY ${OpenMP_CXX_LIBRARIES})
endif() endif()
endforeach() endforeach()
target_link_libraries(migraphx_all_targets INTERFACE migraphx_cpu)
rocm_install_targets( rocm_install_targets(
TARGETS migraphx_cpu TARGETS migraphx_cpu
INCLUDE INCLUDE
......
...@@ -23,14 +23,14 @@ ...@@ -23,14 +23,14 @@
*/ */
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp> #include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/deconvolution.hpp> #include <migraphx/op/convolution_backwards.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace cpu { namespace cpu {
struct dnnl_deconvolution struct dnnl_deconvolution
: dnnl_extend_op<dnnl_deconvolution, dnnl::deconvolution_forward, op::deconvolution> : dnnl_extend_op<dnnl_deconvolution, dnnl::deconvolution_forward, op::convolution_backwards>
{ {
std::vector<int> arg_map(int) const std::vector<int> arg_map(int) const
{ {
......
...@@ -43,7 +43,11 @@ struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot> ...@@ -43,7 +43,11 @@ struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
MIGRAPHX_DNNL_PREFIX(ARG_BIAS)}; MIGRAPHX_DNNL_PREFIX(ARG_BIAS)};
} }
void required(const check_shapes& cs) const { cs.not_broadcasted(); } template <class T>
void required(const check_shapes<T>& cs) const
{
cs.not_broadcasted();
}
dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{ {
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <migraphx/cpu/dnnl.hpp> #include <migraphx/cpu/dnnl.hpp>
#include <migraphx/cpu/parallel.hpp> #include <migraphx/cpu/parallel.hpp>
#include <migraphx/par_for.hpp> #include <migraphx/par_for.hpp>
#include <migraphx/cpu/export.h>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
......
...@@ -400,7 +400,11 @@ struct dnnl_extend_op : dnnl_op<Derived, Primitive> ...@@ -400,7 +400,11 @@ struct dnnl_extend_op : dnnl_op<Derived, Primitive>
} }
// dnnl has some issues with non-packed inputs // dnnl has some issues with non-packed inputs
void required(const check_shapes& cs) const { cs.packed_or_broadcasted(); } template <class T>
void required(const check_shapes<T>& cs) const
{
cs.packed_or_broadcasted();
}
std::string name() const { return "dnnl::" + op.name(); } std::string name() const { return "dnnl::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const shape compute_shape(std::vector<shape> inputs) const
......
...@@ -24,8 +24,7 @@ ...@@ -24,8 +24,7 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP #ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
#define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP #define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
#include <string> #include <migraphx/cpu/context.hpp>
#include <migraphx/config.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
...@@ -34,7 +33,7 @@ struct module; ...@@ -34,7 +33,7 @@ struct module;
namespace cpu { namespace cpu {
struct lowering struct MIGRAPHX_CPU_EXPORT lowering
{ {
std::string name() const { return "cpu::lowering"; } std::string name() const { return "cpu::lowering"; }
void apply(module& m) const; void apply(module& m) const;
......
...@@ -28,14 +28,13 @@ ...@@ -28,14 +28,13 @@
#include <migraphx/register_target.hpp> #include <migraphx/register_target.hpp>
#include <migraphx/compile_options.hpp> #include <migraphx/compile_options.hpp>
#include <migraphx/cpu/context.hpp> #include <migraphx/cpu/context.hpp>
#include <migraphx/config.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
struct pass; struct pass;
namespace cpu { namespace cpu {
struct target struct MIGRAPHX_CPU_EXPORT target
{ {
std::string name() const; std::string name() const;
std::vector<pass> get_passes(migraphx::context& gctx, const compile_options&) const; std::vector<pass> get_passes(migraphx::context& gctx, const compile_options&) const;
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
#include <migraphx/dfor.hpp> #include <migraphx/dfor.hpp>
#include <migraphx/op/identity.hpp> #include <migraphx/op/identity.hpp>
#include <migraphx/op/convolution.hpp> #include <migraphx/op/convolution.hpp>
#include <migraphx/op/deconvolution.hpp> #include <migraphx/op/convolution_backwards.hpp>
#include <migraphx/op/quant_convolution.hpp> #include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp> #include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp> #include <migraphx/op/quant_dot.hpp>
...@@ -345,7 +345,7 @@ struct cpu_apply ...@@ -345,7 +345,7 @@ struct cpu_apply
extend_op("contiguous", "dnnl::reorder"); extend_op("contiguous", "dnnl::reorder");
extend_op("convolution", "dnnl::convolution"); extend_op("convolution", "dnnl::convolution");
#ifndef MIGRAPHX_ENABLE_ZENDNN #ifndef MIGRAPHX_ENABLE_ZENDNN
extend_op("deconvolution", "dnnl::deconvolution"); extend_op("convolution_backwards", "dnnl::convolution_backwards");
extend_op("dot", "dnnl::dot"); extend_op("dot", "dnnl::dot");
#endif #endif
extend_op("erf", "cpu::erf"); extend_op("erf", "cpu::erf");
......
...@@ -61,7 +61,7 @@ namespace cpu { ...@@ -61,7 +61,7 @@ namespace cpu {
std::string target::name() const { return "cpu"; } std::string target::name() const { return "cpu"; }
// cppcheck-suppress constParameter // cppcheck-suppress constParameterReference
std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options&) const std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options&) const
{ {
auto& ctx = any_cast<context>(gctx); auto& ctx = any_cast<context>(gctx);
......
...@@ -41,7 +41,7 @@ class x_model ...@@ -41,7 +41,7 @@ class x_model
void set_shape(migraphx::shape); void set_shape(migraphx::shape);
}; };
x_model create_xmodel(migraphx::module_ref mod); x_model create_xmodel(migraphx::const_module_ref mod);
migraphx::argument execute(const x_model& xmodel, migraphx::argument execute(const x_model& xmodel,
const migraphx::shape& output_shape, const migraphx::shape& output_shape,
......
...@@ -113,8 +113,7 @@ void subgraph::apply(module_pass_manager& mpm) const ...@@ -113,8 +113,7 @@ void subgraph::apply(module_pass_manager& mpm) const
// TODO(varunsh): this code may be replaceable by code in the fuse_pointwise pass // TODO(varunsh): this code may be replaceable by code in the fuse_pointwise pass
// assuming all FPGA instructions are in one contiguous range // assuming all FPGA instructions are in one contiguous range
pm->insert_instructions(pm->end(), first, last, {}); pm->insert_instructions(pm->end(), first, std::next(last), {});
migraphx::instruction_ref placeholder_ins; migraphx::instruction_ref placeholder_ins;
for(auto it : iterator_for(mod)) for(auto it : iterator_for(mod))
{ {
......
...@@ -33,7 +33,7 @@ migraphx::shape x_model::get_shape() const { return shape; }; ...@@ -33,7 +33,7 @@ migraphx::shape x_model::get_shape() const { return shape; };
void x_model::set_shape(migraphx::shape s) { shape = s; } void x_model::set_shape(migraphx::shape s) { shape = s; }
x_model create_xmodel(const migraphx::module_ref mod) x_model create_xmodel(migraphx::const_module_ref mod)
{ {
std::cout << "Calling an external function: create_xmodel!\n"; std::cout << "Calling an external function: create_xmodel!\n";
x_model xmodel; x_model xmodel;
......
...@@ -33,6 +33,11 @@ if(NOT TARGET MIOpen) ...@@ -33,6 +33,11 @@ if(NOT TARGET MIOpen)
message(SEND_ERROR "Cant find miopen") message(SEND_ERROR "Cant find miopen")
endif() endif()
if(NOT WIN32)
# TODO: re-enable when CK is ported to Windows
find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
endif()
if(BUILD_DEV) if(BUILD_DEV)
set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs") set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
else() else()
...@@ -40,12 +45,12 @@ else() ...@@ -40,12 +45,12 @@ else()
endif() endif()
include(Embed) include(Embed)
file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS} file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp) ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}") message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
add_embed_library(migraphx_kernels ${KERNEL_FILES}) add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/)
file(GLOB DEVICE_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp) file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
add_library(migraphx_device ${DEVICE_GPU_SRCS}) add_library(migraphx_device ${DEVICE_GPU_SRCS})
add_library(compile_for_gpu INTERFACE) add_library(compile_for_gpu INTERFACE)
...@@ -65,6 +70,8 @@ target_link_libraries(migraphx_device PUBLIC migraphx) ...@@ -65,6 +70,8 @@ target_link_libraries(migraphx_device PUBLIC migraphx)
target_link_libraries(migraphx_device PRIVATE compile_for_gpu) target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>) target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>) target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes)
migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device)
add_library(kernel_file_check EXCLUDE_FROM_ALL) add_library(kernel_file_check EXCLUDE_FROM_ALL)
...@@ -80,7 +87,13 @@ target_link_libraries(kernel_file_check compile_for_gpu) ...@@ -80,7 +87,13 @@ target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check(kernel_file_check) rocm_clang_tidy_check(kernel_file_check)
file(GLOB JIT_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp) file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
if(WIN32)
# TODO: re-enable when CK is ported to Windows
list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp)
endif()
add_library(migraphx_gpu add_library(migraphx_gpu
abs.cpp abs.cpp
analyze_streams.cpp analyze_streams.cpp
...@@ -95,6 +108,7 @@ add_library(migraphx_gpu ...@@ -95,6 +108,7 @@ add_library(migraphx_gpu
compile_miopen.cpp compile_miopen.cpp
compiler.cpp compiler.cpp
device_name.cpp device_name.cpp
fuse_ck.cpp
fuse_mlir.cpp fuse_mlir.cpp
fuse_ops.cpp fuse_ops.cpp
gather.cpp gather.cpp
...@@ -123,11 +137,14 @@ add_library(migraphx_gpu ...@@ -123,11 +137,14 @@ add_library(migraphx_gpu
schedule_model.cpp schedule_model.cpp
sync_device.cpp sync_device.cpp
target.cpp target.cpp
time_op.cpp
topk.cpp topk.cpp
write_literals.cpp write_literals.cpp
${JIT_GPU_SRCS} ${JIT_GPU_SRCS}
) )
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu) set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
migraphx_generate_export_header(migraphx_gpu)
function(register_migraphx_gpu_ops PREFIX) function(register_migraphx_gpu_ops PREFIX)
foreach(OP ${ARGN}) foreach(OP ${ARGN})
...@@ -169,7 +186,7 @@ register_op(migraphx_gpu ...@@ -169,7 +186,7 @@ register_op(migraphx_gpu
OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot> OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
INCLUDES migraphx/gpu/context.hpp) INCLUDES migraphx/gpu/context.hpp)
register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution> OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::convolution_backwards> gpu::miopen_convolution<op::quant_convolution>
INCLUDES migraphx/gpu/context.hpp) INCLUDES migraphx/gpu/context.hpp)
rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION}) rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_gpu) rocm_clang_tidy_check(migraphx_gpu)
...@@ -181,7 +198,9 @@ if(MIGRAPHX_ENABLE_MLIR) ...@@ -181,7 +198,9 @@ if(MIGRAPHX_ENABLE_MLIR)
find_package(rocMLIR 1.0.0 CONFIG REQUIRED) find_package(rocMLIR 1.0.0 CONFIG REQUIRED)
message(STATUS "Build with rocMLIR::rockCompiler ${rocMLIR_VERSION}") message(STATUS "Build with rocMLIR::rockCompiler ${rocMLIR_VERSION}")
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR") target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
target_link_libraries(migraphx_gpu PUBLIC rocMLIR::rockCompiler) # Make this private to avoid multiple inclusions of LLVM symbols.
# TODO: Fix rocMLIR's library to hide LLVM internals.
target_link_libraries(migraphx_gpu PRIVATE rocMLIR::rockCompiler)
endif() endif()
if(MIGRAPHX_USE_HIPRTC) if(MIGRAPHX_USE_HIPRTC)
...@@ -231,7 +250,12 @@ check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOC ...@@ -231,7 +250,12 @@ check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOC
set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "") set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")
if(MIGRAPHX_USE_FIND_2_API) if(MIGRAPHX_USE_FIND_2_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API) check_library_exists(MIOpen "miopenSetFindOptionPreallocatedTensor" "${MIOPEN_LOCATION}" HAS_PREALLOCATION_API)
if(HAS_PREALLOCATION_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API -DMIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS)
else()
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
endif()
message(STATUS "MIGraphx is using Find-2.0 API of MIOpen") message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
else() else()
message(STATUS "MIGraphx is using legacy Find API in MIOpen") message(STATUS "MIGraphx is using legacy Find API in MIOpen")
...@@ -253,6 +277,10 @@ endif() ...@@ -253,6 +277,10 @@ endif()
target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas) target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels) target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
if(NOT WIN32)
# TODO: re-enable when CK is ported to Windows
target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
endif()
add_subdirectory(driver) add_subdirectory(driver)
add_subdirectory(hiprtc) add_subdirectory(hiprtc)
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include <migraphx/module.hpp> #include <migraphx/module.hpp>
#include <migraphx/dead_code_elimination.hpp> #include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp> #include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/cpp_generator.hpp> #include <migraphx/cpp_generator.hpp>
#include <migraphx/pass_manager.hpp> #include <migraphx/pass_manager.hpp>
#include <migraphx/instruction.hpp> #include <migraphx/instruction.hpp>
...@@ -171,7 +172,8 @@ std::string make_transformer_args(std::vector<std::string> transformers) ...@@ -171,7 +172,8 @@ std::string make_transformer_args(std::vector<std::string> transformers)
void generate_pointwise(cpp_generator& gg, const module& pm, const std::string& name) void generate_pointwise(cpp_generator& gg, const module& pm, const std::string& name)
{ {
module m = pm; module m = pm;
run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}}); run_passes(m,
{rewrite_quantization{}, eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g; cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; }); g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})"); g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
...@@ -280,6 +282,14 @@ std::string generate_reduce(const module& m, const std::string& name) ...@@ -280,6 +282,14 @@ std::string generate_reduce(const module& m, const std::string& name)
not input->get_shape().broadcasted(); not input->get_shape().broadcasted();
}); });
auto inner_names = names; auto inner_names = names;
for(auto input : ins->inputs())
{
if(input->name() != "@param")
continue;
if(contains(tensors, input))
continue;
inner_names[input] += "[out_idx]";
}
for(auto input : tensors) for(auto input : tensors)
inner_names[input] += "_lambda_param"; inner_names[input] += "_lambda_param";
auto call_function = auto call_function =
...@@ -308,6 +318,8 @@ std::string generate_reduce(const module& m, const std::string& name) ...@@ -308,6 +318,8 @@ std::string generate_reduce(const module& m, const std::string& name)
}); });
f.set_attributes({"__device__", "__attribute__((const))"}).set_generic_types(m).set_name(name); f.set_attributes({"__device__", "__attribute__((const))"}).set_generic_types(m).set_name(name);
f.add_generic_param("r"); f.add_generic_param("r");
f.add_generic_param("out_idx");
f.unused_param("out_idx");
g.create_function(f); g.create_function(f);
return g.str(); return g.str();
} }
...@@ -319,7 +331,7 @@ static std::vector<std::string> get_op_names(const module& m) ...@@ -319,7 +331,7 @@ static std::vector<std::string> get_op_names(const module& m)
{ {
if(starts_with(ins.name(), "@")) if(starts_with(ins.name(), "@"))
continue; continue;
if(ins.name() == "multibroadcast") if(contains({"multibroadcast", "contiguous"}, ins.name()))
continue; continue;
if(ins.name() == "pointwise") if(ins.name() == "pointwise")
{ {
......
...@@ -56,9 +56,6 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC); ...@@ -56,9 +56,6 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
#ifdef MIGRAPHX_USE_HIPRTC #ifdef MIGRAPHX_USE_HIPRTC
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);
std::string hiprtc_error(hiprtcResult err, const std::string& msg) std::string hiprtc_error(hiprtcResult err, const std::string& msg)
{ {
return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg)); return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg));
...@@ -194,6 +191,7 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr ...@@ -194,6 +191,7 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
options.push_back("-DMIGRAPHX_HAS_DPP=0"); options.push_back("-DMIGRAPHX_HAS_DPP=0");
options.push_back("-DMIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1"); options.push_back("-DMIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1");
options.push_back("-Wno-reserved-identifier"); options.push_back("-Wno-reserved-identifier");
options.push_back("-Wno-unused-parameter");
options.push_back("-Wno-gnu-line-marker"); options.push_back("-Wno-gnu-line-marker");
options.push_back("-Wno-old-style-cast"); options.push_back("-Wno-old-style-cast");
} }
...@@ -216,6 +214,15 @@ std::vector<std::vector<char>> ...@@ -216,6 +214,15 @@ std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch) compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
{ {
std::vector<hiprtc_src_file> hsrcs{srcs.begin(), srcs.end()}; std::vector<hiprtc_src_file> hsrcs{srcs.begin(), srcs.end()};
if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
{
for(const auto& src : srcs)
{
if(src.path.extension() != ".cpp")
continue;
std::cout << std::string(src.content.first, src.len()) << std::endl;
}
}
auto p = dynamic_loader::path(&compile_hip_src_with_hiprtc); auto p = dynamic_loader::path(&compile_hip_src_with_hiprtc);
auto driver = p.parent_path().parent_path() / "bin" / "migraphx-hiprtc-driver"; auto driver = p.parent_path().parent_path() / "bin" / "migraphx-hiprtc-driver";
......
...@@ -135,10 +135,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over) ...@@ -135,10 +135,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
std::size_t max_global = ctx.get_current_device().get_cu_count() * std::size_t max_global = ctx.get_current_device().get_cu_count() *
ctx.get_current_device().get_max_workitems_per_cu(); ctx.get_current_device().get_max_workitems_per_cu();
return [n, over, max_global](std::size_t local) { return [n, over, max_global](std::size_t local) {
std::size_t groups = (n + local - 1) / local; // hip require global workitems multiple of local workitems. It may degrade performance.
std::size_t max_blocks = max_global / local; // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
std::size_t nglobal = std::min(max_blocks * over, groups) * local; // https://reviews.llvm.org/D155213
return std::min(nglobal, n); std::size_t num_elements = ((n + local - 1) / local) * local;
std::size_t groups = (num_elements + local - 1) / local;
std::size_t max_blocks = max_global / local;
std::size_t nglobal = std::min(max_blocks * over, groups) * local;
return std::min(nglobal, num_elements);
}; };
} }
...@@ -156,14 +160,14 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option ...@@ -156,14 +160,14 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
assert(not options.inputs.empty()); assert(not options.inputs.empty());
assert(options.inputs.size() == options.virtual_inputs.size() or assert(options.inputs.size() == options.virtual_inputs.size() or
options.virtual_inputs.empty()); options.virtual_inputs.empty());
std::vector<src_file> srcs; std::vector<src_file> srcs = options.additional_src_files;
std::transform(migraphx_kernels().begin(), std::transform(migraphx_kernels().begin(),
migraphx_kernels().end(), migraphx_kernels().end(),
std::back_inserter(srcs), std::back_inserter(srcs),
[](auto&& p) { [](auto&& p) {
auto&& name = p.first; auto&& name = p.first;
auto&& c = p.second; auto&& c = p.second;
auto path = fs::path{"migraphx"} / "kernels" / name; auto path = name;
return src_file{path, c}; return src_file{path, c};
}); });
srcs.push_back(src_file{fs::path{"main.cpp"}, srcs.push_back(src_file{fs::path{"main.cpp"},
......
...@@ -79,7 +79,7 @@ void compile_miopen::apply(module& m) const ...@@ -79,7 +79,7 @@ void compile_miopen::apply(module& m) const
std::size_t ws = 0; std::size_t ws = 0;
try try
{ {
// for the regular convolution and deconvolution, this try would always succeed // for the regular convolution and convolution_backwards, this try would always succeed
ws = compile(op, ins, int8_x4_format); ws = compile(op, ins, int8_x4_format);
} }
catch(migraphx::exception&) catch(migraphx::exception&)
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <migraphx/register_op.hpp> #include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp> #include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compiler.hpp> #include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/time_op.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
...@@ -76,33 +77,201 @@ struct compiled_result ...@@ -76,33 +77,201 @@ struct compiled_result
instruction_ref ins; instruction_ref ins;
}; };
// Cache mapping a (operation name, problem description) pair to a tuned
// solution. A null value{} entry means the problem has been seen ("marked")
// but not yet benchmarked, which lets callers avoid re-enumerating solutions
// for a problem that is already in flight.
struct problem_cache
{
// True if this (name, problem) pair has an entry, whether solved or merely marked.
bool has(const std::string& name, const value& problem) const
{
return contains(cache, create_key(name, problem));
}
// Record the winning solution for a problem. The solution must be non-null,
// since a null value is reserved as the "marked but unsolved" sentinel.
void insert(const std::string& name, const value& problem, const value& solution)
{
assert(not solution.is_null());
cache[create_key(name, problem)] = solution;
}
// Mark a problem as seen without a solution yet. Uses map::insert so an
// existing (possibly solved) entry is never overwritten.
void mark(const std::string& name, const value& problem)
{
cache.insert(std::make_pair(create_key(name, problem), value{}));
}
// Look up a problem's entry. Returns nullopt when the problem is unknown;
// returns a null value when the problem is marked but not yet solved.
optional<value> get(const std::string& name, const value& problem) const
{
auto it = cache.find(create_key(name, problem));
if(it == cache.end())
return nullopt;
return it->second;
}
// Build the composite lookup key from the op name and problem description.
static value create_key(const std::string& name, const value& problem)
{
return {{"name", name}, {"problem", problem}};
}
std::unordered_map<value, value> cache;
};
// Per-instruction compilation plan: fetches the tuning config for one
// precompile op, schedules one compile job per candidate solution, then
// benchmarks the results and replaces the instruction with the fastest one.
// The `results` vector is index-aligned with the scheduled compile jobs; a
// nullopt slot records a candidate whose compilation failed.
struct compile_plan
{
context* ctx;
operation preop;
instruction_ref ins;
// Tuning config for this op; nullopt means the op is not tunable and a
// single default compile is performed.
optional<tuning_config> config = nullopt;
std::vector<optional<compiled_result>> results = {};
// Query the tuning config for this instruction. `exhaustive` requests the
// full solution space rather than a pruned set.
void update_config(bool exhaustive)
{
config = get_tuning_config(*ctx, ins, preop, exhaustive);
}
// Append a deferred compile job for one candidate solution, writing its
// outcome into results[i]. Captures by value so the job is safe to run
// from a worker thread; compile failures are recorded as nullopt rather
// than propagated, so one bad candidate does not abort tuning.
template <class Vector>
void insert_compiles(Vector& compiles, const value& solution, std::size_t i)
{
compiles.emplace_back([=] {
try
{
results[i] = compiled_result{compile(*ctx, ins, preop, solution), ins};
}
catch(...)
{
results[i] = nullopt;
}
});
}
// Schedule the compile jobs for this plan into `compiles`.
// - Tunable op with a cached solution: compile just that solution (or skip
//   entirely if the cache entry is still the null "in progress" marker).
// - Tunable op, first time seen: mark it in the cache and compile every
//   candidate solution for later benchmarking.
// - Non-tunable op: single compile with an empty solution value.
template <class Vector>
void add_compiles(Vector& compiles, problem_cache& pc)
{
if(config.has_value())
{
const auto& problem = config->problem;
if(auto sol = pc.get(preop.name(), problem))
{
auto solution = sol.value();
// No solution yet until benchmarked so skip for now
if(solution.is_null())
return;
results.resize(1);
insert_compiles(compiles, solution, 0);
}
else
{
pc.mark(preop.name(), problem);
const auto& solutions = config->solutions;
results.resize(solutions.size());
for(auto i : range(solutions.size()))
{
auto solution = solutions[i];
insert_compiles(compiles, solution, i);
}
}
}
else
{
results.resize(1);
insert_compiles(compiles, value{}, 0);
}
}
// Pick the compiled result to use. With a single result it is returned
// directly; with multiple candidates each is timed with time_op and the
// fastest is stored back into the problem cache. Failed candidates are
// given an infinite time so they are never selected. Throws when nothing
// compiled successfully.
const compiled_result& benchmark(problem_cache& pc) const
{
if(results.empty())
MIGRAPHX_THROW("No configs to tune");
if(results.size() == 1)
{
if(not results.front().has_value())
MIGRAPHX_THROW("No configs to tune");
return *results.front();
}
if(not config)
MIGRAPHX_THROW("Multiple kernels without config");
std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
<< std::endl;
std::vector<double> times;
times.reserve(results.size());
std::transform(
results.begin(), results.end(), std::back_inserter(times), [&](const auto& cr) {
if(not cr.has_value())
return std::numeric_limits<double>::max();
// 20 timed runs per candidate; only the mean (.first) is used.
return time_op(*ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20)
.first;
});
auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
// Persist the winner so other instructions with the same problem reuse it.
pc.insert(preop.name(), config->problem, config->solutions.at(i));
if(not results[i].has_value())
MIGRAPHX_THROW("No valid tuned compilation.");
return *results[i];
}
// Benchmark (if needed) and splice the winning compiled code into the module.
void replace(module& m, problem_cache& pc) const
{
const auto& cr = benchmark(pc);
cr.replace.replace(m, cr.ins);
}
};
template <class F> template <class F>
void par_compile(std::size_t n, F f) void par_compile(std::size_t n, F f)
{ {
if(n == 0) if(n == 0)
return; return;
par_for(n, n / value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{}, n), f); auto d = value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{});
if(d == 0)
d = n;
par_for(n, n / d, f);
} }
void compile_ops::apply(module& m) const struct compile_manager
{ {
std::vector<std::function<compiled_result()>> compiles; problem_cache pc;
std::vector<compile_plan> cps;
bool exhaustive = false;
template <class... Ts>
void add_plan(Ts&&... xs)
{
cps.push_back({std::forward<Ts>(xs)...});
}
void update_configs()
{
par_compile(cps.size(), [&](auto i) { cps[i].update_config(exhaustive); });
}
void compile(module& m)
{
std::vector<std::function<void()>> compiles;
for(auto& cp : cps)
{
cp.add_compiles(compiles, pc);
}
par_compile(compiles.size(), [&](auto i) { compiles[i](); });
// Replace and/or benchmark
for(const auto& cp : cps)
{
if(cp.results.empty())
continue;
cp.replace(m, pc);
}
// Remove compile_plan already executed
cps.erase(std::remove_if(cps.begin(),
cps.end(),
[](const auto& cp) { return not cp.results.empty(); }),
cps.end());
}
};
void compile_ops::apply(module& m) const
{
compile_manager cm;
cm.exhaustive = exhaustive_tune;
// Find all precompile ops
for(auto ins : iterator_for(m)) for(auto ins : iterator_for(m))
{ {
if(ins->name() != "gpu::precompile_op") if(ins->name() != "gpu::precompile_op")
continue; continue;
operation preop = any_cast<precompile_op>(ins->get_operator()).op; operation preop = any_cast<precompile_op>(ins->get_operator()).op;
compiles.emplace_back([=]() -> compiled_result { cm.add_plan(ctx, preop, ins);
return {compile(*ctx, ins, preop), ins};
});
}
std::vector<compiled_result> results(compiles.size());
par_compile(compiles.size(), [&](auto i) { results[i] = compiles[i](); });
for(const auto& cr : results)
{
cr.replace(m, cr.ins);
} }
cm.update_configs();
cm.compile(m);
// Compile already tuned configs
cm.compile(m);
assert(cm.cps.empty());
} }
} // namespace gpu } // namespace gpu
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment