Merge branch 'develop' into blas_tuning

23cb7917 · Brian Pickrell · GitHub · b5fcc0bc · ea32ca70 · 23cb7917
Unverified Commit 23cb7917 authored Aug 16, 2023 by Brian Pickrell Committed by GitHub Aug 16, 2023
20 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -218,7 +218,15 @@ using common_type_t = typename common_type<Ts...>::type;

 #define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>

-constexpr unsigned long int_max(unsigned long n) { return (1u << (n * 8)) - 1; }
+constexpr unsigned long int_max(unsigned long n)
+{
+    // Note, left shift cannot be used to get the maximum value of int64_type or
+    // uint64_type because it is undefined behavior to left shift 64 bits for
+    // these types
+    if(n == sizeof(int64_t))
+        return -1;
+    return (1ul << (n * 8)) - 1;
+}

 template <class T,
          MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
@@ -228,9 +236,9 @@ constexpr T numeric_max()
    if constexpr(is_integral<T>{})
    {
        if constexpr(is_unsigned<T>{})
-            return int_max(sizeof(T)) * 2;
-        else
            return int_max(sizeof(T));
+        else
+            return int_max(sizeof(T)) / 2;
    }
    else if constexpr(is_same<T, double>{})
        return __DBL_MAX__;

--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -135,7 +135,7 @@ constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
        return vec<T, N>{x};
    else
    {
-        MIGRAPHX_ASSERT((i + N) < vec_size<T>());
+        MIGRAPHX_ASSERT((i + N) <= vec_size<T>());
        vec<vec_type<T>, N> result = {0};
        for(int j = 0; j < N; j++)
        {

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -22,12 +22,19 @@
 * THE SOFTWARE.
 */
 #include <iterator>
-#include <migraphx/gpu/lowering.hpp>
+#include <utility>
+#include <functional>
+#include <algorithm>
+#include <map>
+
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/stringutils.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/program.hpp>

 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/if_op.hpp>
@@ -35,17 +42,12 @@
 #include <migraphx/op/quant_dot.hpp>

 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/gemm.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/compiler.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/program.hpp>
-#include <utility>
-#include <functional>
-#include <algorithm>
-#include <map>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -54,6 +56,7 @@ namespace gpu {
 struct miopen_apply
 {
    module* mod              = nullptr;
+    module_pass_manager* mpm = nullptr;
    const lowering* pass     = nullptr;
    std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
    instruction_ref last{};
@@ -83,7 +86,7 @@ struct miopen_apply
        auto& ctx      = get_context();
        int8_x4_format = get_int8_x4_format(ctx);
        compute_fp32   = get_compute_fp32_flag();
-        offload_copy   = (mod->name() == "main") ? pass->offload_copy : false;
+        offload_copy   = (mod == mpm->get_root_module()) ? pass->offload_copy : false;

        add_generic_op("contiguous");

@@ -103,7 +106,7 @@ struct miopen_apply
        add_extend_op("topk");

        add_convolution_op("convolution");
-        add_convolution_op("deconvolution");
+        add_convolution_op("convolution_backwards");
        add_convolution_op("quant_convolution");
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
@@ -375,7 +378,10 @@ struct miopen_apply
    }
 };

-void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
+void lowering::apply(module_pass_manager& mpm) const
+{
+    miopen_apply{&mpm.get_module(), &mpm, this}.apply();
+}

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
@@ -36,7 +36,10 @@
 #include <mutex>
 #if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
 #warning "Incompatible version of rocMLIR library used, disabling"
+// Only undefine when not using cppcheck
+#ifndef CPPCHECK
 #undef MIGRAPHX_MLIR
+#endif
 #else
 #include <mlir-c/RegisterRocMLIR.h>
 #endif
@@ -50,8 +53,10 @@
 #include <migraphx/ranges.hpp>
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/perfdb.hpp>
+#include <migraphx/gpu/tuning_config.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/permutation.hpp>
 #include <deque>
@@ -122,6 +127,9 @@ struct mlir_handle
 #define MIGRAPHX_MANAGE_MLIR_HANDLE(T, F) migraphx::gpu::mlir_handle<T, decltype(&F), &F> // NOLINT

 using mlir_context     = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirContext, mlirContextDestroy);
+using mlir_thread_pool = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirLlvmThreadPool, mlirLlvmThreadPoolDestroy);
+using mlir_dialect_registry  = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirDialectRegistry,
+                                                          mlirDialectRegistryDestroy);
 using mlir_module            = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirModule, mlirModuleDestroy);
 using mlir_operation         = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOperation, mlirOperationDestroy);
 using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
@@ -131,6 +139,10 @@ using mlir_block             = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockD
 using mlir_pass_manager      = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
 using mlir_tuning_table      = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable,
                                                      mlirRockTuningTableDestroy);
+using mlir_tuning_space      = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningSpace,
+                                                      mlirRockTuningSpaceDestroy);
+using mlir_tuning_param      = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningParam,
+                                                      mlirRockTuningParamDestroy);

 std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; }

@@ -164,25 +176,41 @@ std::string mlir_print(F f, T x)
    return ss.str();
 }

-const std::unordered_set<std::string>& get_xdlops_archs()
-{
-    static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
-    return supported_archs;
-}
-
 struct mlir_program
 {
    mlir_program()
-        : ctx(mlirContextCreate()),
+        : ctx(mlirContextCreateWithRegistry(get_dialect_registry().get(),
+                                            /*threadingEnable=*/false)),
          location(mlirLocationUnknownGet(ctx.get())),
          mmodule(mlirModuleCreateEmpty(location))
    {
-        MlirDialectRegistry registry = mlirDialectRegistryCreate();
-        mlirRegisterRocMLIRDialects(registry);
-        mlirContextAppendDialectRegistry(ctx.get(), registry);
+        mlirContextSetThreadPool(ctx.get(), get_thread_pool().get());
        mlirContextLoadAllAvailableDialects(ctx.get());
-        mlirDialectRegistryDestroy(registry);
-        mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
+    }
+
+    static mlir_dialect_registry& get_dialect_registry()
+    {
+        static std::once_flag init_guard;
+        static mlir_dialect_registry the_registry;
+        // The MLIR registration functions (for dialects and passes) are not
+        // necessarily thread-safe and need to be executed exactly once
+        // (especially since they eventually call non-thread-safe LLVM
+        // initializations).
+        std::call_once(init_guard, [&]() {
+            the_registry = mlirDialectRegistryCreate();
+            mlirRegisterRocMLIRDialects(the_registry.get());
+            mlirRegisterRocMLIRPasses();
+        });
+        return the_registry;
+    }
+
+    static mlir_thread_pool& get_thread_pool()
+    {
+        // To save on overhead, we create one LLVM thread pool and reuse it
+        // across all MLIR contexts as recommended by MLIR upstream.
+        // Note that function-local static initialization is thread-safe as of C++11.
+        static mlir_thread_pool the_pool = mlirLlvmThreadPoolCreate();
+        return the_pool;
    }

    MlirType make_type(shape::type_t t) const
@@ -244,8 +272,6 @@ struct mlir_program

    MlirAttribute attribute(std::int64_t i) const
    {
-        if(i < 0)
-            MIGRAPHX_THROW("MLIR cant handle negative values since they are ambiguous");
        return mlirIntegerAttrGet(mlirIntegerTypeGet(ctx.get(), 64), i);
    }
    MlirAttribute attribute(std::uint64_t i) const
@@ -324,7 +350,8 @@ struct mlir_program
                                     std::string,
                                     value,
                                     std::vector<value>,
-                                     MlirType>;
+                                     MlirType,
+                                     MlirAttribute>;
    using named_attribute_t = std::pair<std::string_view, attribute_t>;

    MlirNamedAttribute name_attribute(const named_attribute_t& na) const
@@ -365,14 +392,20 @@ struct mlir_program
        mlir_operation_state& add_attributes(const std::vector<named_attribute_t>& named_attrs)
        {
            auto attributes = prog->name_attributes(named_attrs);
+            if(not attributes.empty())
+            {
                mlirOperationStateAddAttributes(&op_state, attributes.size(), attributes.data());
+            }
            return *this;
        }

        mlir_operation_state& add_attribute_value(const value& v)
        {
            auto attributes = prog->name_attributes(v);
+            if(not attributes.empty())
+            {
                mlirOperationStateAddAttributes(&op_state, attributes.size(), attributes.data());
+            }
            return *this;
        }

@@ -395,13 +428,19 @@ struct mlir_program
                return shape{r.type(), r.lens()};
            });
            auto x = prog->make_tensors(reshaped);
+            if(not x.empty())
+            {
                mlirOperationStateAddResults(&op_state, x.size(), x.data());
+            }
            return *this;
        }

        mlir_operation_state& add_operands(const std::vector<MlirValue>& inputs)
+        {
+            if(not inputs.empty())
            {
                mlirOperationStateAddOperands(&op_state, inputs.size(), inputs.data());
+            }
            return *this;
        }

@@ -411,7 +450,10 @@ struct mlir_program
            std::transform(regions.begin(), regions.end(), mregions.begin(), [](const auto& r) {
                return r.get();
            });
+            if(not mregions.empty())
+            {
                mlirOperationStateAddOwnedRegions(&op_state, mregions.size(), mregions.data());
+            }
            mlir_operation op(mlirOperationCreate(&op_state));
            // Release memory since mlir_operation owns it
            for(auto& r : regions)
@@ -468,7 +510,8 @@ struct mlir_program
        ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
                            {"sym_name", sym_name},
                            {"kernel", std::string("mixr")},
-                            {"arch", target_arch}});
+                            {"arch", target_arch},
+                            {"num_cu", num_cu}});
        ops.add_region(std::move(region));
        insert(body, std::move(ops));

@@ -481,6 +524,10 @@ struct mlir_program
    {
        if(ins->name() == "@return")
            return "func.return";
+        if(ins->name() == "@literal")
+        {
+            return "tosa.const";
+        }
        return "migraphx." + ins->name();
    }

@@ -511,14 +558,7 @@ struct mlir_program

    static std::string get_symbol_name(const module& m)
    {
-        for(auto ins : iterator_for(m))
-        {
-            if(ins->name() == "convolution" or ins->name() == "dot")
-            {
-                return "mlir_" + ins->name();
-            }
-        }
-        return "main";
+        return "mlir_" + gen::generate_name_from_ops(m);
    }

    void parse(const module& m)
@@ -532,20 +572,28 @@ struct mlir_program
        {
            if(ins->name() == "@param")
                continue;
+            if(ins->name() == "contiguous")
+            {
+                ins_map[ins] = ins_map[ins->inputs().at(0)];
+                continue;
+            }
            auto name = get_name(ins);
            auto ops  = create_operation_state(name);
            ops.add_attribute_value(get_operator_value(ins->get_operator()));
            if(ins->name() != "@return")
                ops.add_results({get_shape(ins)});
+            if(ins->name() == "@literal")
+            {
+                literal r            = ins->get_literal();
+                MlirType tensor_type = make_tensor(ins->get_shape());
+                MlirAttribute mlir_value_attr =
+                    mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data());
+                ops.add_attributes({{"value", mlir_value_attr}});
+            }
            if(ins->name() == "convolution" or ins->name() == "dot")
            {
                pp =
                    problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
-                // check if HW supports xdlops
-                auto target_chip = trim(split_string(target_arch, ':').front());
-                bool xdlops      = contains(get_xdlops_archs(), target_chip);
-                if(xdlops)
-                    ops.add_attributes({{"xdlopsV2", true}});
            }

            std::vector<MlirValue> inputs;
@@ -562,18 +610,30 @@ struct mlir_program
        }
    }

-    code_object_op compile() MIGRAPHX_TIDY_CONST
+    void run_high_level_pipeline() MIGRAPHX_TIDY_CONST
    {
        mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())};
-        mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
-        // 1st pipeline to call
        mlirMIGraphXAddHighLevelPipeline(pm_front.get());
-        mlirPassManagerRun(pm_front.get(), mmodule.get());
+        mlirPassManagerRunOnOp(pm_front.get(), mlirModuleGetOperation(mmodule.get()));
+    }

-        // 2nd pipeline to call
-        get_module_tuned();
+    void run_backend_pipeline() MIGRAPHX_TIDY_CONST
+    {
+        mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
        mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str());
-        mlirPassManagerRun(pm_back.get(), mmodule.get());
+        mlirPassManagerRunOnOp(pm_back.get(), mlirModuleGetOperation(mmodule.get()));
+    }
+
+    code_object_op compile(const value& solution) MIGRAPHX_TIDY_CONST
+    {
+        // 1st pipeline to call
+        run_high_level_pipeline();
+        if(solution.is_null())
+            get_module_tuned();
+        else
+            set_tuning(solution);
+        // 2nd pipeline to call
+        run_backend_pipeline();

        code_object_op op{};
        op.symbol_name                = sym_name;
@@ -582,7 +642,12 @@ struct mlir_program
        return op;
    }

-    void find_target() { target_arch = get_device_name(); }
+    void set_gpu_properties(const context& migraphx_ctx)
+    {
+        const auto& device = migraphx_ctx.get_current_device();
+        target_arch  = device.get_device_name();
+        num_cu       = device.get_cu_count();
+    }

    std::pair<std::size_t, std::size_t> get_launch_params() const
    {
@@ -596,7 +661,7 @@ struct mlir_program

    value::binary get_binary() const
    {
-        int size = 0;
+        size_t size = 0;
        mlirGetBinary(mmodule.get(), &size, nullptr);
        value::binary result(size);
        if(mlirGetBinary(mmodule.get(), &size, reinterpret_cast<char*>(result.data())))
@@ -604,14 +669,52 @@ struct mlir_program
        MIGRAPHX_THROW("Failed to compile mlir program");
    }

+    void set_tuning(const value& v) MIGRAPHX_TIDY_CONST
+    {
+        const auto* str = v.if_string();
+        if(str == nullptr)
+            MIGRAPHX_THROW("mlir tuning solutions must be strings");
+        if(not mlirRockTuningSetFromStr(mmodule.get(), make_mlir_string_ref(*str)))
+            MIGRAPHX_THROW("Failed setting tuning key: " + *str);
+    }
+
+    tuning_config get_tuning_config() MIGRAPHX_TIDY_CONST
+    {
+        tuning_config tc;
+        run_high_level_pipeline();
+        mlir_tuning_space params{
+            mlirRockTuningSpaceCreate(mmodule.get(), RocmlirTuningParamSetKindFull)};
+        for(auto i : range(mlirRockTuningGetNumParams(params.get())))
+        {
+            mlir_tuning_param param{mlirRockTuningParamCreate()};
+            if(not mlirRockTuningParamGet(params.get(), i, param.get()))
+                MIGRAPHX_THROW("Incorrect mlir tuning parameter: " + std::to_string(i));
+            std::array<char, ROCMLIR_TUNING_KEY_BUFSZ> perf_key;
+            size_t perf_key_bytes =
+                mlirRockTuningParamToString(param.get(), perf_key.data(), perf_key.size());
+            if(perf_key_bytes > perf_key.size())
+                MIGRAPHX_THROW("Tuning perf key was " + std::to_string(perf_key_bytes) +
+                               " bytes and thus too long");
+            tc.solutions.emplace_back(perf_key.begin(), perf_key.begin() + perf_key_bytes);
+        }
+        std::array<char, ROCMLIR_TUNING_KEY_BUFSZ> tuning_key;
+        size_t tuning_key_bytes =
+            mlirRockTuningGetKey(mmodule.get(), tuning_key.data(), tuning_key.size());
+        if(tuning_key_bytes > tuning_key.size())
+            MIGRAPHX_THROW("Tuning table key was " + std::to_string(tuning_key_bytes) +
+                           " bytes and thus too long");
+        tc.problem = std::string(tuning_key.begin(), tuning_key.begin() + tuning_key_bytes);
+        return tc;
+    }
+
    std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); }

    // This function appends to tuning cfg file that could be
    // used with rocMLIR tuning scripts.
-    void dump_tuning_cfg(const char* prob_config) const
+    void dump_tuning_cfg(const std::string& prob_config) const
    {
        std::string tuning_cfg_path = string_value_of(MIGRAPHX_MLIR_TUNING_CFG{});
-        if(!tuning_cfg_path.empty())
+        if(not tuning_cfg_path.empty())
        {
            std::vector<std::string> tokens = split_string(prob_config, '\t');
            std::string prob                = tokens[1];
@@ -628,46 +731,66 @@ struct mlir_program
        }
    }

-    static mlir_tuning_table create_tuning_table()
+    static std::pair<mlir_tuning_table, bool> load_tuning_table()
    {
        mlir_tuning_table tuning_table{mlirRockTuningTableCreate()};
+        bool found_table           = false;
        std::string tuning_db_path = string_value_of(MIGRAPHX_MLIR_TUNING_DB{});
-        if(!tuning_db_path.empty())
+        if(not tuning_db_path.empty())
        {
            std::ifstream tuning_db_tsv(tuning_db_path);
            if(tuning_db_tsv)
            {
+                found_table = true;
                std::string line;
                while(std::getline(tuning_db_tsv, line))
                {
                    std::vector<std::string> tokens = split_string(line, '\t');
                    std::string arch                = tokens[0];
-                    std::string prob                = tokens[1];
-                    std::string perf                = tokens[2];
-                    std::string key                 = arch.append("\t").append(prob);
-                    mlirRockTuningUpdateTable(tuning_table.get(), key.c_str(), perf.c_str(), 1.0);
+                    std::string num_cu              = tokens[1];
+                    std::string prob                = tokens[2];
+                    std::string perf                = tokens[3];
+                    std::string key = arch.append("\t").append(num_cu).append("\t").append(prob);
+                    mlirRockTuningUpdateTable(tuning_table.get(),
+                                              make_mlir_string_ref(key),
+                                              make_mlir_string_ref(perf),
+                                              1.0);
                }
            }
        }
        else
        {
+            found_table = false;
            std::cerr
                << "WARNING: MLIR tuning db not found. Please set MIGRAPHX_MLIR_TUNING_DB for "
                   "optimal performance."
                << std::endl;
        }
-        return tuning_table;
+        return std::make_pair(std::move(tuning_table), found_table);
    }

    bool get_module_tuned() const
    {
-        static mlir_tuning_table tuning_table = create_tuning_table();
-        if(!mlirRockTuningSetFromTable(tuning_table.get(), mmodule.get()))
+        static std::pair<mlir_tuning_table, bool> tuning_table = load_tuning_table();
+        if(not mlirRockTuningSetFromTable(tuning_table.first.get(), mmodule.get()))
+        {
+            std::array<char, ROCMLIR_TUNING_KEY_BUFSZ> prob_config;
+            size_t prob_config_bytes =
+                mlirRockTuningGetKey(mmodule.get(), prob_config.data(), prob_config.size());
+            if(prob_config_bytes >= prob_config.size())
            {
-            const char* prob_config = mlirRockTuningGetKey(tuning_table.get(), mmodule.get());
-            std::stringstream key(prob_config);
-            std::cerr << "fails to set param on" << prob_config << std::endl;
-            dump_tuning_cfg(prob_config);
+                std::cerr << "MLIR tuning key overflowed buffer, needed " << prob_config_bytes
+                          << " bytes" << std::endl;
+                return false;
+            }
+            std::string prob_config_str(prob_config.begin(),
+                                        prob_config.begin() + prob_config_bytes);
+            if(tuning_table.second)
+            {
+                std::cerr << "NOTE: MLIR tuning table did not include a key for " << prob_config_str
+                          << std::endl;
+            }
+            dump_tuning_cfg(prob_config_str);
            return false;
        }
        return true;
@@ -678,7 +801,8 @@ struct mlir_program
    mlir_module mmodule;
    problem_params pp;
    std::deque<std::string> strings{};
-    std::string target_arch;
+    std::string target_arch = "";
+    std::size_t num_cu      = 0;
    std::string sym_name;
 };

@@ -690,14 +814,14 @@ std::string dump_mlir(const module& m)
    return mlir_print(&mlirOperationPrint, mod_op);
 }

-void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
+void adjust_param_shapes(module& m, const std::vector<shape>& inputs)
 {
    auto names = m.get_parameter_names();
    std::sort(names.begin(), names.end());
    for(auto i : range(names.size()))
    {
        const auto& name  = names[i];
-        const auto& input = inputs[i]->get_shape();
+        const auto& input = inputs[i];
        auto param        = m.get_parameter(name);
        if(input.standard())
            continue;
@@ -735,23 +859,25 @@ void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
    }
 }

-code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
+code_object_op compile_mlir(const context& migraphx_ctx,
+                            module m,
+                            const std::vector<instruction_ref>& inputs,
+                            const value& solution)
 {
-    adjust_param_shapes(m, inputs);
+    adjust_param_shapes(m, to_shapes(inputs));
    const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
+
    if(trace)
        std::cout << m << std::endl;

-    // set mutex while llvm thread support is disabled.
-    static std::mutex g_mlirc_mutex; // NOLINT
-    const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
    mlir_program mp;
-    mp.find_target();
+    mp.set_gpu_properties(migraphx_ctx);
    mp.parse(m);
    auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
    if(trace)
        std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
-    auto co   = mp.compile();
+    auto co            = mp.compile(solution);
+    co.expected_inputs = to_shapes(inputs);
    co.output          = m.get_output_shapes().front();
    return co;
 }
@@ -772,6 +898,17 @@ instruction_ref insert_mlir(module& m,
    return m.insert_instruction(ins, co, refs);
 }

+tuning_config
+get_tuning_config_mlir(const context& migraphx_ctx, module m, const std::vector<shape>& inputs)
+{
+    adjust_param_shapes(m, inputs);
+
+    mlir_program mp;
+    mp.set_gpu_properties(migraphx_ctx);
+    mp.parse(m);
+    return mp.get_tuning_config();
+}
+
 #else

 std::string dump_mlir(const module&) { return {}; }
@@ -783,20 +920,27 @@ void use(T&)

 // Disabling clang-tidy warning on non-real useage.
 // NOLINTBEGIN(performance-unnecessary-value-param)
-code_object_op compile_mlir(const context&, module, const std::vector<instruction_ref>&)
+code_object_op
+compile_mlir(const context&, module, const std::vector<instruction_ref>&, const value&)
 {
    return {};
 }
-// NOLINTEND(performance-unnecessary-value-param)

 instruction_ref
 // cppcheck-suppress funcArgNamesDifferent
 insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<instruction_ref>&)
 {
    use(co);
+    use(m);
    return m.end();
 }

+tuning_config get_tuning_config_mlir(const context&, module, const std::vector<shape>&)
+{
+    return {};
+}
+// NOLINTEND(performance-unnecessary-value-param)
+
 #endif

 } // namespace gpu

--- a/src/targets/gpu/rocblas.cpp
+++ b/src/targets/gpu/rocblas.cpp
@@ -47,32 +47,24 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s)
    return rb;
 }

-const std::unordered_set<std::string>& get_rocblas_fp32_archs()
-{
-    static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
-    return supported_archs;
-}
-
 bool get_compute_fp32_flag()
 {
-    bool compute_fp32 = false;
-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
    const auto device_name = trim(split_string(get_device_name(), ':').front());
-    if(contains(get_rocblas_fp32_archs(), device_name))
-        compute_fp32 = true;
-#endif
-    return compute_fp32;
+    return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
 }

 bool get_int8_x4_format(context& ctx)
 {
-    bool int8_x4_format = true;
-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
+#if ROCBLAS_VERSION_MAJOR >= 3
+    (void)(ctx);
+    return false;
+#else
+    // int8x4 packed format is only available starting from rocblas-v2.38 and it is deprecated in
+    // v3.0 and will be removed in v4.0
    rocblas_gemm_flags flag;
    rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
-    int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
+    return flag == rocblas_gemm_flags_pack_int8x4;
 #endif
-    return int8_x4_format;
 }
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -57,6 +57,7 @@
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device_name.hpp>
+#include <migraphx/gpu/fuse_ck.hpp>
 #include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
@@ -72,9 +73,12 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
+#ifndef _WIN32
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
+#endif
+
 struct id_pass
 {
    std::string name() const { return "id"; }
@@ -98,16 +102,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
    unsupported_types.erase(shape::type_t::bool_type);
    unsupported_types.erase(shape::type_t::int8_type);
    unsupported_types.erase(shape::type_t::uint8_type);
+    unsupported_types.erase(shape::type_t::int32_type);
    unsupported_types.erase(shape::type_t::tuple_type);
    // clang-format off
    return
    {
-        enable_pass(options.split_single_dyn_dim, split_single_dyn_dim{}),
-        enable_pass(options.split_single_dyn_dim, dead_code_elimination{}),
+        split_single_dyn_dim{},
+        dead_code_elimination{},
        normalize_ops{},
        dead_code_elimination{},
        simplify_qdq{},
-        rewrite_quantization{},
+        enable_pass(not mlir_enabled(), rewrite_quantization{}),
        dead_code_elimination{},
        eliminate_data_type{unsupported_types, shape::type_t::float_type},
        simplify_reshapes{},
@@ -121,7 +126,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        inline_module{},
        rewrite_pooling{},
        dead_code_elimination{},
-        rewrite_gelu{},
+        enable_pass(options.fast_math, rewrite_gelu{}),
        optimize_module{},
        enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
        dead_code_elimination{},
@@ -129,11 +134,15 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        auto_contiguous{},
        optimize_module{},
-        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
+        fuse_pointwise{},
        dead_code_elimination{},
        enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
        dead_code_elimination{},
-        fuse_mlir{&ctx},
+#ifndef _WIN32
+        enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}),
+#endif
+        dead_code_elimination{},
+        enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
        dead_code_elimination{},
        lowering{&ctx, options.offload_copy},
        eliminate_contiguous{"gpu::contiguous"},
@@ -150,7 +159,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        adjust_allocation{gpu_allocation_model{}},
        dead_code_elimination{},
-        compile_ops{&ctx},
+        compile_ops{&ctx, options.exhaustive_tune},
        dead_code_elimination{},
        promote_literals{},
        dead_code_elimination{},

--- a/src/targets/gpu/driver/perf.cpp
+++ b/src/targets/gpu/driver/perf.cpp
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/driver/perf.hpp>
+#include <migraphx/gpu/time_op.hpp>
 #include <migraphx/context.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/time.hpp>
@@ -30,12 +30,11 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-namespace driver {

 std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsigned long seed = 0)
 {
    std::vector<argument> args;
-    std::transform(shapes.begin(), shapes.end(), std::back_inserter(args), [&](auto& s) {
+    std::transform(shapes.begin(), shapes.end(), std::back_inserter(args), [&](const auto& s) {
        return to_gpu(generate_argument(s, seed++));
    });
    return args;
@@ -69,7 +68,6 @@ time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
    return std::make_pair(host_time / n, device_time / n);
 }

-} // namespace driver
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/ref/CMakeLists.txt
+++ b/src/targets/ref/CMakeLists.txt
@@ -37,6 +37,8 @@ target_link_libraries(migraphx_ref PUBLIC migraphx)
 target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
 target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)

+migraphx_generate_export_header(migraphx_ref)
+
 rocm_install_targets(
  TARGETS migraphx_ref
  INCLUDE

--- a/src/targets/ref/include/migraphx/ref/context.hpp
+++ b/src/targets/ref/include/migraphx/ref/context.hpp
@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP

 #include <migraphx/config.hpp>
+#include <migraphx/ref/export.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/ref/include/migraphx/ref/lowering.hpp
+++ b/src/targets/ref/include/migraphx/ref/lowering.hpp
@@ -24,14 +24,14 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP

+#include <migraphx/ref/context.hpp>
 #include <migraphx/program.hpp>
-#include <migraphx/config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace ref {

-struct lowering
+struct MIGRAPHX_REF_EXPORT lowering
 {
    std::string name() const { return "ref::lowering"; }
    void apply(module& m) const;

--- a/src/targets/ref/include/migraphx/ref/target.hpp
+++ b/src/targets/ref/include/migraphx/ref/target.hpp
@@ -35,7 +35,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 struct pass;
 namespace ref {

-struct target
+struct MIGRAPHX_REF_EXPORT target
 {
    std::string name() const;
    std::vector<pass> get_passes(migraphx::context& ctx, const compile_options&) const;

--- a/src/targets/ref/lowering.cpp
+++ b/src/targets/ref/lowering.cpp
@@ -27,7 +27,7 @@
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/identity.hpp>
 #include <migraphx/op/convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
+#include <migraphx/op/convolution_backwards.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/quant_dot.hpp>

--- a/src/tf/CMakeLists.txt
+++ b/src/tf/CMakeLists.txt
@@ -42,8 +42,9 @@ target_compile_options(tf-proto PRIVATE -w)
 target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY})
 set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On)

-file(GLOB TF_SRCS ${CONFIGURE_DEPENDS} *.cpp)
+file(GLOB TF_SRCS CONFIGURE_DEPENDS *.cpp)
 add_library(migraphx_tf ${TF_SRCS})
+migraphx_generate_export_header(migraphx_tf)
 target_include_directories(migraphx_tf PRIVATE include)
 set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf)
 rocm_set_soversion(migraphx_tf ${MIGRAPHX_SO_VERSION})

--- a/src/tf/op_parser.cpp
+++ b/src/tf/op_parser.cpp
@@ -46,6 +46,7 @@ std::vector<std::string> get_op_parsers()
                   op_parser_map().end(),
                   std::back_inserter(result),
                   [&](auto&& p) { return p.first; });
+    std::sort(result.begin(), result.end());
    return result;
 }


--- a/src/tf/parse_batchnorm.cpp
+++ b/src/tf/parse_batchnorm.cpp
@@ -52,7 +52,6 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
        auto x_type = args[0]->get_shape().type();

        // unsqueeze tensors of shape (C) to broadcast correctly
-        auto rt  = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
        auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});

        auto scale_unsqueeze =
@@ -64,11 +63,11 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
        auto var_unsqueeze =
            info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[4]);

-        auto numer   = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
+        auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
        auto var_eps    = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
-        auto denom   = info.add_broadcastable_binary_op("pow", var_eps, rt);
-        auto div0    = info.add_broadcastable_binary_op("div", numer, denom);
-        auto r0      = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
+        auto rsqrt      = info.add_instruction(make_op("rsqrt"), var_eps);
+        auto mul0       = info.add_broadcastable_binary_op("mul", scale_unsqueeze, rsqrt);
+        auto r0         = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0);
        return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
    }
 };

--- a/src/tf/tf.cpp
+++ b/src/tf/tf.cpp
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */
 #include <migraphx/tf/tf_parser.hpp>
+#include <migraphx/tf/op_parser.hpp>
 #include <iostream>
 #include <fstream>
 #include <unordered_map>
@@ -62,5 +63,7 @@ program parse_tf(const std::string& name, const tf_options& options)
    return std::move(parser.prog);
 }

+// Thin forwarding wrapper: returns whatever tf::get_op_parsers() returns,
+// without modifying the list.
+std::vector<std::string> get_tf_operators() { return tf::get_op_parsers(); }
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/tf/tf_parser.cpp
+++ b/src/tf/tf_parser.cpp
@@ -338,7 +338,7 @@ void tf_parser::parse_node(const std::string& name)
            std::string input_name = input;
            // if input has trailing `:0` index then remove it
            auto multi_out_idx = input.find(':');
-            if(multi_out_idx != std::string::npos && input.substr(multi_out_idx + 1) == "0")
+            if(multi_out_idx != std::string::npos and input.substr(multi_out_idx + 1) == "0")
            {
                input_name = input.substr(0, multi_out_idx);
            }

--- a/src/value.cpp
+++ b/src/value.cpp
@@ -28,6 +28,7 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/optional.hpp>
+#include <migraphx/hash.hpp>
 #include <unordered_map>
 #include <utility>

@@ -284,7 +285,7 @@ bool value::contains(const std::string& pkey) const
 }
 std::size_t value::size() const
 {
-    auto* a = if_array_impl(x);
+    const auto* a = if_array_impl(x);
    if(a == nullptr)
        return 0;
    return a->size();
@@ -519,6 +520,38 @@ std::ostream& operator<<(std::ostream& os, const value& d)
    return os;
 }

+// Generic overload: hashes a (key, payload) pair by starting from the hash
+// of `key` and folding the payload `x` into it with hash_combine.
+// Returns the combined hash.
+template <class T>
+std::size_t value_hash(const std::string& key, const T& x)
+{
+    std::size_t h = hash_value(key);
+    hash_combine(h, x);
+    return h;
+}
+
+// Null payload overload: there is nothing to fold in, so the result is just
+// the hash of `key`.
+std::size_t value_hash(const std::string& key, std::nullptr_t) { return hash_value(key); }
+
+// std::vector<value> payload overload: starts from the hash of `key` and
+// folds every element of `x` into it, in iteration order. An empty vector
+// therefore yields the same result as hash_value(key).
+std::size_t value_hash(const std::string& key, const std::vector<value>& x)
+{
+    std::size_t h = hash_value(key);
+    for(const auto& v : x)
+        hash_combine(h, v);
+    return h;
+}
+// value::binary payload overload: same scheme as the vector overload --
+// start from the hash of `key` and fold each element of `x` in iteration
+// order.
+std::size_t value_hash(const std::string& key, const value::binary& x)
+{
+    std::size_t h = hash_value(key);
+    for(const auto& v : x)
+        hash_combine(h, v);
+    return h;
+}
+
+// Computes a hash of this value from its key and its payload by passing both
+// to the value_hash overload selected for the payload type.
+// NOTE(review): presumably visit_value invokes the callback exactly once with
+// the currently stored payload -- confirm. If the callback is never invoked,
+// the result stays 0.
+std::size_t value::hash() const
+{
+    std::size_t h = 0;
+    this->visit_value([&](const auto& a) { h = value_hash(this->get_key(), a); });
+    return h;
+}
+
 void value::debug_print(bool show_type) const
 {
    if(show_type)

--- a/src/verify_args.cpp
+++ b/src/verify_args.cpp
@@ -35,7 +35,7 @@ bool verify_args(const std::string& name,
    bool passed = true;
    visit_all(ref_arg, target_arg)([&](auto ref, auto target) {
        double error;
-        passed = verify_range(ref, target, tolerance, &error);
+        passed = verify::verify_range(ref, target, tolerance, &error);
        if(not passed)
        {
            // TODO: Check for nans
@@ -45,27 +45,27 @@ bool verify_args(const std::string& name,
                std::cout << "ref:" << ref << std::endl;
            if(target.size() < 32)
                std::cout << "target:" << target << std::endl;
-            if(range_zero(ref))
+            if(verify::range_zero(ref))
                std::cout << "Ref data is all zeros" << std::endl;
-            if(range_zero(target))
+            if(verify::range_zero(target))
                std::cout << "Target data is all zeros" << std::endl;

-            auto mxdiff = max_diff(ref, target);
+            auto mxdiff = verify::max_diff(ref, target);
            std::cout << "Max diff: " << mxdiff << std::endl;

-            auto idx = mismatch_idx(ref, target, float_equal);
-            if(idx < range_distance(ref))
+            auto idx = verify::mismatch_idx(ref, target, float_equal);
+            if(idx < verify::range_distance(ref))
            {
                std::cout << "Mismatch at " << idx << ": " << ref[idx] << " != " << target[idx]
                          << std::endl;
            }

-            auto ref_nan_idx = find_idx(ref, not_finite);
+            auto ref_nan_idx = find_idx(ref, verify::not_finite);
            if(ref_nan_idx >= 0)
                std::cout << "Non finite number found in ref at " << ref_nan_idx << ": "
                          << ref[ref_nan_idx] << std::endl;

-            auto target_nan_idx = find_idx(target, not_finite);
+            auto target_nan_idx = find_idx(target, verify::not_finite);
            if(target_nan_idx >= 0)
                std::cout << "Non finite number found in target at " << target_nan_idx << ": "
                          << target[target_nan_idx] << std::endl;
@@ -73,27 +73,27 @@ bool verify_args(const std::string& name,
        }
        else
        {
-            if(range_zero(ref))
+            if(verify::range_zero(ref))
                std::cout << "Ref data is all zeros" << std::endl;
-            if(range_zero(target))
+            if(verify::range_zero(target))
                std::cout << "Target data is all zeros" << std::endl;

            // auto mxdiff = max_diff(ref, target);
            // std::cout << "Max diff: " << mxdiff << std::endl;

            // auto idx = mismatch_idx(ref, target, float_equal);
-            // if(idx < range_distance(ref))
+            // if(idx < verify::range_distance(ref))
            // {
            //     std::cout << "Mismatch at " << idx << ": " << ref[idx] << " != " << target[idx]
            //               << std::endl;
            // }

-            auto ref_nan_idx = find_idx(ref, not_finite);
+            auto ref_nan_idx = find_idx(ref, verify::not_finite);
            if(ref_nan_idx >= 0)
                std::cout << "Non finite number found in ref at " << ref_nan_idx << ": "
                          << ref[ref_nan_idx] << std::endl;

-            auto target_nan_idx = find_idx(target, not_finite);
+            auto target_nan_idx = find_idx(target, verify::not_finite);
            if(target_nan_idx >= 0)
                std::cout << "Non finite number found in target at " << target_nan_idx << ": "
                          << target[target_nan_idx] << std::endl;

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -24,8 +24,6 @@

 cmake_policy(SET CMP0057 NEW)

-include(CTest)
-
 find_package(Threads REQUIRED)
 include(ProcessorCount)
 ProcessorCount(N)
@@ -100,21 +98,15 @@ endfunction()

+# add_test_executable(<TEST_NAME> <sources...>)
+# Creates the executable <TEST_NAME> from the remaining arguments (excluded
+# from the default build), registers it through add_test_command, makes the
+# `tests` and `check` targets depend on it, and links it against
+# Threads::Threads, migraphx, migraphx_onnx and migraphx_ref.
 function(add_test_executable TEST_NAME)
    add_executable(${TEST_NAME} EXCLUDE_FROM_ALL ${ARGN})
-    target_link_libraries(${TEST_NAME} ${CMAKE_THREAD_LIBS_INIT})
-
-    # Cmake does not add flags correctly for gcc
-    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
-        set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS -pthread LINK_FLAGS -pthread)
-    endif()
    set(TEST_COMMAND ${TEST_NAME})
    add_test_command(${TEST_NAME} ${TEST_COMMAND})
    add_dependencies(tests ${TEST_NAME})
    add_dependencies(check ${TEST_NAME})
-    target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_ref)
+    target_link_libraries(${TEST_NAME} Threads::Threads migraphx migraphx_onnx migraphx_ref)
    target_include_directories(${TEST_NAME} PUBLIC include)
 endfunction(add_test_executable)

-file(GLOB TESTS ${CONFIGURE_DEPENDS} *.cpp)
+file(GLOB TESTS CONFIGURE_DEPENDS *.cpp)

 foreach(TEST ${TESTS})
    get_filename_component(BASE_NAME ${TEST} NAME_WE)
@@ -124,7 +116,7 @@ endforeach()

 if(MIGRAPHX_ENABLE_GPU)
    # gpu tests
-    file(GLOB GPU_TESTS ${CONFIGURE_DEPENDS} gpu/*.cpp)
+    file(GLOB GPU_TESTS CONFIGURE_DEPENDS gpu/*.cpp)

    foreach(TEST ${GPU_TESTS})
        get_filename_component(BASE_NAME ${TEST} NAME_WE)
@@ -134,13 +126,16 @@ if(MIGRAPHX_ENABLE_GPU)
            COST 10
            RESOURCE_LOCK gpu
        )
+        if(MIGRAPHX_USE_HIPRTC)
+        target_compile_definitions(test_gpu_${BASE_NAME} PUBLIC -DMIGRAPHX_USE_HIPRTC)
+        endif()
        target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
    endforeach()
 endif()

 if(MIGRAPHX_ENABLE_FPGA)
    # fpga tests
-    file(GLOB FPGA_TESTS ${CONFIGURE_DEPENDS} fpga/*.cpp)
+    file(GLOB FPGA_TESTS CONFIGURE_DEPENDS fpga/*.cpp)

    foreach(TEST ${FPGA_TESTS})
        get_filename_component(BASE_NAME ${TEST} NAME_WE)
@@ -187,12 +182,36 @@ if(MIGRAPHX_ENABLE_PYTHON)
    add_subdirectory(py)
 endif()

+# Multi-target tests: only configured when the GPU, CPU and FPGA targets are
+# all enabled. Every *.cpp under test/multi_target becomes its own
+# executable named test_<basename>.
+if(MIGRAPHX_ENABLE_GPU AND MIGRAPHX_ENABLE_CPU AND MIGRAPHX_ENABLE_FPGA)
+    set(TEST_MULTI_TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR}/multi_target)
+    file(GLOB MULTI_TARGET_TESTS CONFIGURE_DEPENDS ${TEST_MULTI_TARGET_DIR}/*.cpp)
+
+    foreach(MULTI_TARGET_TEST ${MULTI_TARGET_TESTS})
+        get_filename_component(BASE_NAME ${MULTI_TARGET_TEST} NAME_WE)
+        set(TEST_NAME test_${BASE_NAME})
+        add_executable(${TEST_NAME} ${MULTI_TARGET_TEST})
+        rocm_clang_tidy_check(${TEST_NAME})
+        target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_tf migraphx_all_targets)
+        target_include_directories(${TEST_NAME} PUBLIC include)
+        # The test is run with the multi_target source directory as its working directory.
+        add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> WORKING_DIRECTORY ${TEST_MULTI_TARGET_DIR})
+        # Hook the executable into the aggregate `tests` and `check` targets.
+        add_dependencies(tests ${TEST_NAME})
+        add_dependencies(check ${TEST_NAME})
+    endforeach()
+endif()
+
+
 function(test_header NAME HEADER)
-    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
-        "#include <${HEADER}>\nint main() {}\n"
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp "
+#include <${HEADER}>
+int main() {}\n"
    )
-    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-static-include-${NAME}.cpp
-        "#include <${HEADER}>\n"
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-static-include-${NAME}.cpp "
+#include <${HEADER}>
+#if defined(min) || defined(max) || defined(near) || defined(far)
+#error \"Do not include windows.h in header files\"
+#endif
+\n"
    )
    add_test_executable(${NAME}
        ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
@@ -201,14 +220,14 @@ function(test_header NAME HEADER)
 endfunction()

+# test_headers(<PREFIX> <glob patterns...>)
+# Globs the given header patterns and, for each header found, calls
+# test_header with the name header_<identifier> and the include path
+# <PREFIX>/<basename>.hpp. The identifier is the header's path relative to
+# CMAKE_SOURCE_DIR converted to a C identifier. Each resulting target is
+# linked against migraphx, migraphx_onnx, migraphx_tf and migraphx_all_targets.
 function(test_headers PREFIX)
-    file(GLOB HEADERS ${CONFIGURE_DEPENDS} ${ARGN})
+    file(GLOB HEADERS CONFIGURE_DEPENDS ${ARGN})

    foreach(HEADER ${HEADERS})
        file(RELATIVE_PATH HEADER_REL ${CMAKE_SOURCE_DIR} ${HEADER})
        string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)
        get_filename_component(BASE_NAME ${HEADER} NAME_WE)
        test_header(header_${TEST_NAME} ${PREFIX}/${BASE_NAME}.hpp)
-        target_link_libraries(header_${TEST_NAME} migraphx_all_targets)
+        target_link_libraries(header_${TEST_NAME} migraphx migraphx_onnx migraphx_tf migraphx_all_targets)
    endforeach()
 endfunction()

@@ -225,3 +244,4 @@ if(MIGRAPHX_ENABLE_FPGA)
    test_headers(migraphx/fpga ${CMAKE_SOURCE_DIR}/src/targets/fpga/include/migraphx/fpga/*.hpp)
 endif()

+