Commit 63952fb9 authored by Brian Pickrell

Merge branch 'develop' into multinomial_parse

parents 61f3895c e7471141
......@@ -88,8 +88,6 @@ foreach(LIBRARY ${OpenMP_CXX_LIBRARIES})
endif()
endforeach()
target_link_libraries(migraphx_all_targets INTERFACE migraphx_cpu)
rocm_install_targets(
TARGETS migraphx_cpu
INCLUDE
......
......@@ -111,9 +111,27 @@ struct compile_plan
context* ctx;
operation preop;
instruction_ref ins;
optional<tuning_config> config = nullopt;
std::vector<compiled_result> results = {};
void update_config() { config = get_tuning_config(*ctx, ins, preop); }
optional<tuning_config> config = nullopt;
std::vector<optional<compiled_result>> results = {};
void update_config(bool exhaustive)
{
config = get_tuning_config(*ctx, ins, preop, exhaustive);
}
template <class Vector>
void insert_compiles(Vector& compiles, const value& solution, std::size_t i)
{
compiles.emplace_back([=] {
try
{
results[i] = compiled_result{compile(*ctx, ins, preop, solution), ins};
}
catch(...)
{
results[i] = nullopt;
}
});
}
template <class Vector>
void add_compiles(Vector& compiles, problem_cache& pc)
{
......@@ -127,9 +145,7 @@ struct compile_plan
if(solution.is_null())
return;
results.resize(1);
compiles.emplace_back([=] {
results[0] = compiled_result{compile(*ctx, ins, preop, solution), ins};
});
insert_compiles(compiles, solution, 0);
}
else
{
......@@ -139,18 +155,14 @@ struct compile_plan
for(auto i : range(solutions.size()))
{
auto solution = solutions[i];
compiles.emplace_back([=] {
results[i] = compiled_result{compile(*ctx, ins, preop, solution), ins};
});
insert_compiles(compiles, solution, i);
}
}
}
else
{
results.resize(1);
compiles.emplace_back([=] {
results[0] = compiled_result{compile(*ctx, ins, preop, value{}), ins};
});
insert_compiles(compiles, value{}, 0);
}
}
const compiled_result& benchmark(problem_cache& pc) const
......@@ -158,7 +170,11 @@ struct compile_plan
if(results.empty())
MIGRAPHX_THROW("No configs to tune");
if(results.size() == 1)
return results.front();
{
if(not results.front().has_value())
MIGRAPHX_THROW("No configs to tune");
return *results.front();
}
if(not config)
MIGRAPHX_THROW("Multiple kernels without config");
std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
......@@ -167,11 +183,17 @@ struct compile_plan
times.reserve(results.size());
std::transform(
results.begin(), results.end(), std::back_inserter(times), [&](const auto& cr) {
return time_op(*ctx, cr.replace.code_object, to_shapes(cr.ins->inputs()), 20).first;
if(not cr.has_value())
return std::numeric_limits<double>::max();
return time_op(*ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20)
.first;
});
auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
pc.insert(preop.name(), config->problem, config->solutions.at(i));
return results[i];
if(not results[i].has_value())
MIGRAPHX_THROW("No valid tuned compilation.");
return *results[i];
}
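A note on the selection rule above: a candidate whose compile failed is left as nullopt in results, gets scored with the largest representable time, and can therefore only win when every candidate failed, in which case the has_value() check throws. A minimal standalone sketch of that rule, using plain std::optional<double> timings rather than MIGraphX compiled results (all names here are illustrative, not part of the MIGraphX API):
#include <algorithm>
#include <iostream>
#include <iterator>
#include <limits>
#include <optional>
#include <stdexcept>
#include <vector>
int main()
{
    // Hypothetical timings; nullopt marks a candidate whose compile failed.
    std::vector<std::optional<double>> timings = {2.5, std::nullopt, 1.8, 3.1};
    std::vector<double> times;
    times.reserve(timings.size());
    std::transform(timings.begin(), timings.end(), std::back_inserter(times), [](const auto& t) {
        // Failed candidates get the worst possible score so min_element never
        // picks them unless nothing else is available.
        return t.has_value() ? *t : std::numeric_limits<double>::max();
    });
    auto best = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
    if(not timings[best].has_value())
        throw std::runtime_error("No valid tuned compilation.");
    std::cout << "Fastest candidate: index " << best << "\n";
}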
void replace(module& m, problem_cache& pc) const
{
......@@ -185,7 +207,10 @@ void par_compile(std::size_t n, F f)
{
if(n == 0)
return;
par_for(n, n / value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{}, n), f);
auto d = value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{});
if(d == 0)
d = n;
par_for(n, n / d, f);
}
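A brief note on the guard above: value_of can return zero when MIGRAPHX_GPU_COMPILE_PARALLEL is unset or explicitly set to 0, so falling back to n keeps the n / d grain-size computation from dividing by zero. A minimal sketch of that fallback, with a hypothetical read_parallel_env() standing in for value_of:
#include <cstddef>
#include <cstdlib>
#include <string>
// Hypothetical stand-in for value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{}):
// read the environment variable, returning 0 when it is unset.
static std::size_t read_parallel_env()
{
    const char* s = std::getenv("MIGRAPHX_GPU_COMPILE_PARALLEL");
    return s == nullptr ? 0 : std::stoul(s);
}
// Grain size passed to par_for; assumes n > 0 (the caller returns early when
// n == 0). Defaulting the divisor to n means an unset or zero-valued
// environment variable cannot cause a division by zero.
static std::size_t compile_grain_size(std::size_t n)
{
    auto d = read_parallel_env();
    if(d == 0)
        d = n;
    return n / d;
}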
struct compile_manager
......@@ -202,9 +227,7 @@ struct compile_manager
void update_configs()
{
if(not exhaustive)
return;
par_compile(cps.size(), [&](auto i) { cps[i].update_config(); });
par_compile(cps.size(), [&](auto i) { cps[i].update_config(exhaustive); });
}
void compile(module& m)
......
......@@ -63,9 +63,10 @@ compile_op(const std::string& name, context& ctx, const std::vector<shape>& inpu
return compiler_map().at(name).compile_op(ctx, inputs, v);
}
optional<tuning_config> get_tuning_config(context& ctx, instruction_ref ins, const operation& op)
optional<tuning_config>
get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive)
{
return compiler_map().at(op.name()).get_tuning_config(ctx, ins, op);
return compiler_map().at(op.name()).get_tuning_config(ctx, ins, op, exhaustive);
}
} // namespace gpu
......
......@@ -83,10 +83,23 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
return false;
auto a = ins->inputs().front()->get_shape();
auto b = ins->inputs().back()->get_shape();
auto m = a.lens()[a.lens().size() - 2];
auto n = b.lens().back();
auto k = a.lens().back();
// Integer GEMM dimensions must be divisible by 4 in CK
if(contains({shape::int8_type, shape::int32_type}, ins->get_shape().type()))
{
if(m % 4 != 0)
return false;
if(n % 4 != 0)
return false;
if(k % 4 != 0)
return false;
}
// Skipping GEMMs with a K dimension greater than 2048 is a coarse-grained strategy
// to avoid poor-performing GEMM kernels from CK
// To-do: Investigate a more precise strategy
return a.lens().back() <= 2048;
return k <= 2048;
}
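Read together, the checks above form a small eligibility heuristic: integer GEMMs need M, N, and K all divisible by 4 for CK, and any GEMM with K above 2048 is rejected as a coarse performance filter. A reduced sketch of that heuristic on plain integers (ck_gemm_is_eligible is a hypothetical helper, not the matcher API):
#include <cstddef>
// Minimal sketch of the CK GEMM eligibility heuristic; is_integer corresponds
// to an int8/int32 result type in the real matcher.
static bool ck_gemm_is_eligible(std::size_t m, std::size_t n, std::size_t k, bool is_integer)
{
    // Integer GEMM dimensions must be divisible by 4 in CK.
    if(is_integer and (m % 4 != 0 or n % 4 != 0 or k % 4 != 0))
        return false;
    // Coarse-grained filter: large-K GEMMs tend to perform poorly with CK.
    return k <= 2048;
}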
struct find_ck_gemm_pointwise
......
......@@ -139,7 +139,8 @@ struct find_mlir_op
auto matcher() const
{
auto dot_or_conv = match::skip(match::name("contiguous"))(
match::any_of(match::name("dot"), is_mlir_conv()).bind("gemm_based_op"));
match::any_of(match::name("dot"), match::name("quant_dot"), is_mlir_conv())
.bind("gemm_based_op"));
return match::name("pointwise")(match::any_of[match::inputs()](dot_or_conv.bind("x")));
}
......@@ -190,6 +191,68 @@ struct find_mlir_op
return {new_gemm_based_op, top_inputs};
}
// Whitelist supported fusion options, including imposing type constraints
// for cases where MLIR only supports an operation (usually a pointwise function)
// on particular types.
bool is_pointwise_op_supported_by_mlir(const instruction& i) const
{
using type_t = shape::type_t;
const auto& name = i.name();
const auto result_type = i.get_shape().type();
const std::initializer_list<type_t> allowed_types = {type_t::float_type,
type_t::half_type,
type_t::int8_type,
type_t::int32_type,
type_t::bool_type};
// Preliminary type check.
if(not contains(allowed_types, result_type))
{
return false;
}
const std::initializer_list<std::string> any_type_ops = {"@literal", "@param", "@return"};
const std::initializer_list<std::string> no_bool_ops = {"convolution",
"quant_convolution",
"dot",
"quant_dot",
"add",
"clip",
"sub",
"mul",
"div",
"pow",
"where",
"quantizelinear",
"dequantizelinear",
"abs",
"neg"};
const std::initializer_list<std::string> fp_only_ops = {"ceil",
"erf",
"exp",
"floor",
"log",
"recip",
"rsqrt",
"sigmoid"
"softmax",
"tanh"};
bool is_float = contains({type_t::float_type, type_t::half_type}, result_type);
if(contains(any_type_ops, name))
return true;
if(result_type != type_t::bool_type && contains(no_bool_ops, name))
return true;
if(is_float && contains(fp_only_ops, name))
return true;
// Only conversions between floating types are known to be unambiguously
// supported.
if(is_float && name == "convert")
{
return std::all_of(i.inputs().begin(), i.inputs().end(), [](const auto& arg) {
return contains({type_t::float_type, type_t::half_type}, arg->get_shape().type());
});
}
return false;
}
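As the comments above explain, the whitelist pairs an operator-name check with a result-type check, and convert is only admitted when the result and every input are floating-point. A reduced sketch of that convert rule, with a hypothetical elem_type enum standing in for shape::type_t:
#include <algorithm>
#include <vector>
// Hypothetical reduced model of the type check: only conversions where the
// result and all inputs are float or half are treated as unambiguously
// supported.
enum class elem_type { float32, float16, int8, int32, boolean };
static bool is_float(elem_type t) { return t == elem_type::float32 or t == elem_type::float16; }
static bool convert_supported(elem_type result, const std::vector<elem_type>& inputs)
{
    if(not is_float(result))
        return false;
    return std::all_of(inputs.begin(), inputs.end(), is_float);
}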
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
......@@ -197,31 +260,12 @@ struct find_mlir_op
auto x_ins = r.instructions["x"]; // input after contiguous
auto* pm = ins->module_inputs().front();
auto names = pm->get_parameter_names();
// Whitelist pointwise operators
if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
return not contains({"@literal",
"@param",
"@return",
"convolution",
"quant_convolution",
"dot",
"add",
"relu",
"dequantizelinear",
"quantizelinear",
"mul"},
i.name());
}))
return;
// Only fuse with fp32/fp16/int8/int32
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
return not contains({shape::type_t::float_type,
shape::type_t::half_type,
shape::type_t::int8_type,
shape::type_t::int32_type},
i->get_shape().type());
// Whitelist pointwise operators.
if(std::any_of(pm->begin(), pm->end(), [&](const auto& i) {
return not is_pointwise_op_supported_by_mlir(i);
}))
return;
std::sort(names.begin(), names.end());
module_ref mm = mpm.create_module("mlir_" + pm->name());
mm->set_bypass();
......
......@@ -79,7 +79,7 @@ using compiler_compile =
using compiler_compile_op =
std::function<operation(context&, const std::vector<shape>& inputs, const value&)>;
using compiler_tuning_config =
std::function<optional<tuning_config>(context&, instruction_ref, const operation&)>;
std::function<optional<tuning_config>(context&, instruction_ref, const operation&, bool)>;
void register_compiler(const std::string& name,
compiler_compile c,
......@@ -91,7 +91,8 @@ compiler_replace
compile(context& ctx, instruction_ref ins, const operation& op, const value& solution);
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
optional<tuning_config> get_tuning_config(context& ctx, instruction_ref ins, const operation& op);
optional<tuning_config>
get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive);
template <class T>
void register_compiler()
......@@ -125,7 +126,8 @@ template <class Derived>
struct compiler : auto_register_compiler<Derived>
{
const Derived& derived() const { return static_cast<const Derived&>(*this); }
optional<tuning_config> get_tuning_config(context&, instruction_ref, const operation&) const
optional<tuning_config>
get_tuning_config(context&, instruction_ref, const operation&, bool) const
{
return nullopt;
}
......
......@@ -30,7 +30,7 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
struct module_pass_manager;
namespace gpu {
......@@ -45,7 +45,7 @@ struct lowering
context* ctx;
bool offload_copy;
std::string name() const { return "gpu::lowering"; }
void apply(module& m) const;
void apply(module_pass_manager& mpm) const;
};
} // namespace gpu
......
......@@ -50,6 +50,7 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);
// NOLINTNEXTLINE
static const char* const ck_gemm_kernel = R"__migraphx__(
......@@ -265,7 +266,7 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
s = shape{s.type(), {m1, m2}};
}
std::vector<std::string> names() const { return {"gpu::ck_gemm"}; }
std::vector<std::string> names() const { return {"ck_gemm", "gpu::ck_gemm"}; }
static bool standard_batch(const shape& s)
{
......@@ -418,9 +419,7 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
{
auto shapes = to_shapes(ins->inputs());
auto v = create_settings(ins, op);
if(solution.is_null())
v["tuning_value"] = 4;
else
if(not solution.is_null())
v["tuning_value"] = solution;
return {compile_op(ctx, shapes, v),
[=](module& m, instruction_ref ins2, const operation& code_object) {
......@@ -436,8 +435,10 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
}
optional<tuning_config>
get_tuning_config(context& ctx, instruction_ref ins, const operation& op) const
get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) const
{
if(not exhaustive and not enabled(MIGRAPHX_TUNE_CK{}))
return nullopt;
tuning_config tc;
auto shapes = to_shapes(ins->inputs());
auto problem = create_problem(shapes, create_settings(ins, op));
......
......@@ -52,7 +52,7 @@ __device__ void ck_gemm_matrix(E e, A a, B b, Ds... ds)
ck::make_tuple(to_ck_tensor<Ds>()...),
to_ck_tensor<E>());
static_assert(desc.is_valid, "Invalid ck gemm.");
static_assert(desc.IsValid(), "Invalid ck gemm.");
G::Run(desc,
to_ck_const_pointer(a.data()),
......
......@@ -22,12 +22,19 @@
* THE SOFTWARE.
*/
#include <iterator>
#include <migraphx/gpu/lowering.hpp>
#include <utility>
#include <functional>
#include <algorithm>
#include <map>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/if_op.hpp>
......@@ -35,17 +42,12 @@
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <utility>
#include <functional>
#include <algorithm>
#include <map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -53,8 +55,9 @@ namespace gpu {
struct miopen_apply
{
module* mod = nullptr;
const lowering* pass = nullptr;
module* mod = nullptr;
module_pass_manager* mpm = nullptr;
const lowering* pass = nullptr;
std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
instruction_ref last{};
bool offload_copy = false;
......@@ -83,8 +86,7 @@ struct miopen_apply
auto& ctx = get_context();
int8_x4_format = get_int8_x4_format(ctx);
compute_fp32 = get_compute_fp32_flag();
// TODO: Set Offload copy based on root modules' compile options
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
offload_copy = (mod == mpm->get_root_module()) ? pass->offload_copy : false;
add_generic_op("contiguous");
......@@ -376,7 +378,10 @@ struct miopen_apply
}
};
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
void lowering::apply(module_pass_manager& mpm) const
{
miopen_apply{&mpm.get_module(), &mpm, this}.apply();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -121,7 +121,10 @@ struct mlir_handle
#define MIGRAPHX_MANAGE_MLIR_HANDLE(T, F) migraphx::gpu::mlir_handle<T, decltype(&F), &F> // NOLINT
using mlir_context = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirContext, mlirContextDestroy);
using mlir_context = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirContext, mlirContextDestroy);
using mlir_thread_pool = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirLlvmThreadPool, mlirLlvmThreadPoolDestroy);
using mlir_dialect_registry = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirDialectRegistry,
mlirDialectRegistryDestroy);
using mlir_module = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirModule, mlirModuleDestroy);
using mlir_operation = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOperation, mlirOperationDestroy);
using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
......@@ -173,16 +176,38 @@ bool has_xdlops(const std::string& target_arch)
struct mlir_program
{
mlir_program()
: ctx(mlirContextCreate()),
: ctx(mlirContextCreateWithRegistry(get_dialect_registry().get(),
/*threadingEnable=*/false)),
location(mlirLocationUnknownGet(ctx.get())),
mmodule(mlirModuleCreateEmpty(location))
{
MlirDialectRegistry registry = mlirDialectRegistryCreate();
mlirRegisterRocMLIRDialects(registry);
mlirContextAppendDialectRegistry(ctx.get(), registry);
mlirContextSetThreadPool(ctx.get(), get_thread_pool().get());
mlirContextLoadAllAvailableDialects(ctx.get());
mlirDialectRegistryDestroy(registry);
mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
}
static mlir_dialect_registry& get_dialect_registry()
{
static std::once_flag init_guard;
static mlir_dialect_registry the_registry;
// The MLIR registration functions (for dialects and passes) are not
// necessarily thread-safe and need to be executed exactly once
// (especially since they eventually call non-thread-safe LLVM
// initializations).
std::call_once(init_guard, [&]() {
the_registry = mlirDialectRegistryCreate();
mlirRegisterRocMLIRDialects(the_registry.get());
mlirRegisterRocMLIRPasses();
});
return the_registry;
}
static mlir_thread_pool& get_thread_pool()
{
// To save on overhead, we create one LLVM thread pool and reuse it
// across all MLIR contexts as recommended by MLIR upstream.
// Note that this is thread-safe as of C++11.
static mlir_thread_pool the_pool = mlirLlvmThreadPoolCreate();
return the_pool;
}
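Both helpers above lean on standard C++ guarantees: std::call_once runs the dialect and pass registration exactly once, and a function-local static is itself initialized thread-safely from C++11 onward, which is what makes the shared thread pool safe to hand out. A minimal sketch of the same pattern, with a hypothetical register_everything() standing in for the RocMLIR registration calls:
#include <mutex>
// Hypothetical stand-in for mlirRegisterRocMLIRDialects / mlirRegisterRocMLIRPasses:
// not thread-safe on its own, so it must run exactly once.
static void register_everything() {}
struct registry
{
    int dialects = 0; // placeholder state
};
static registry& get_registry()
{
    static std::once_flag init_guard;
    // A function-local static is initialized exactly once, even when several
    // threads reach this point concurrently (guaranteed since C++11).
    static registry the_registry;
    std::call_once(init_guard, [] { register_everything(); });
    return the_registry;
}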
MlirType make_type(shape::type_t t) const
......@@ -244,8 +269,6 @@ struct mlir_program
MlirAttribute attribute(std::int64_t i) const
{
if(i < 0)
MIGRAPHX_THROW("MLIR cant handle negative values since they are ambiguous");
return mlirIntegerAttrGet(mlirIntegerTypeGet(ctx.get(), 64), i);
}
MlirAttribute attribute(std::uint64_t i) const
......
......@@ -76,7 +76,6 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_FAST_GELU)
struct id_pass
{
......@@ -125,7 +124,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
inline_module{},
rewrite_pooling{},
dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_FAST_GELU{}), rewrite_gelu{}),
enable_pass(options.fast_math, rewrite_gelu{}),
optimize_module{},
enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
dead_code_elimination{},
......
......@@ -24,8 +24,6 @@
cmake_policy(SET CMP0057 NEW)
include(CTest)
find_package(Threads REQUIRED)
include(ProcessorCount)
ProcessorCount(N)
......
......@@ -232,7 +232,6 @@ TEST_CASE(reused_twice)
auto count = std::distance(mm->begin(), mm->end());
run_pass(p);
p.debug_print();
EXPECT(std::distance(mm->begin(), mm->end()) != count);
EXPECT(std::distance(mm->begin(), mm->end()) == 4);
}
......@@ -274,4 +273,17 @@ TEST_CASE(param_not_eliminated)
EXPECT(p == create_program());
}
TEST_CASE(tuple_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto one = mm->add_literal(1);
auto two = mm->add_literal(2);
mm->add_instruction(tuple_op{}, one, two);
mm->add_return({one, two});
auto count = std::distance(mm->begin(), mm->end());
run_pass(p);
EXPECT(std::distance(mm->begin(), mm->end()) == (count - 1));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/gpu/fuse_mlir.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/program.hpp>
#include <basic_ops.hpp>
#include <migraphx/make_op.hpp>
#include <test.hpp>
#include <pointwise.hpp>
void run_pass(migraphx::program& p)
{
migraphx::run_passes(p, {migraphx::gpu::fuse_mlir{}, migraphx::dead_code_elimination{}});
}
template <class F>
migraphx::instruction_ref add_mlir(migraphx::program& p,
const std::string& name,
std::vector<migraphx::instruction_ref> inputs,
std::vector<std::string> arg_names,
F f)
{
assert(inputs.size() == arg_names.size() && "One interior parameter name given per input.");
auto* mm = p.get_main_module();
auto* pm = p.create_module(name);
pm->set_bypass();
std::vector<migraphx::instruction_ref> params;
for(size_t i = 0, e = inputs.size(); i < e; ++i)
{
params.push_back(pm->add_parameter(arg_names[i], inputs[i]->get_shape()));
}
auto values = f(pm, params);
auto root = std::get<0>(values);
auto r = std::get<1>(values);
pm->add_return({r});
return mm->add_instruction(
migraphx::make_op("gpu::mlir_op", {{"op", migraphx::to_value(root->get_operator())}}),
inputs,
{pm});
}
TEST_CASE(dot_add)
{
migraphx::shape s{migraphx::shape::float_type, {1, 3, 3}};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto a = mm->add_parameter("a", s);
auto b = mm->add_parameter("b", s);
auto x = mm->add_parameter("x", s);
auto dot = mm->add_instruction(migraphx::make_op("dot"), a, b);
auto add = add_pointwise(p1, "main:pointwise0", {dot, x}, single_pointwise("add"));
mm->add_return({add});
}
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto a = mm->add_parameter("a", s);
auto b = mm->add_parameter("b", s);
auto x = mm->add_parameter("x", s);
auto fused =
add_mlir(p2,
"mlir_main:pointwise0",
{x, a, b},
{"x1", "y0", "y1"},
[=](auto* pm, const auto& inputs) {
auto dot =
pm->add_instruction(migraphx::make_op("dot"), inputs[1], inputs[2]);
auto add = pm->add_instruction(migraphx::make_op("add"), dot, inputs[0]);
return std::make_tuple(dot, add);
});
mm->add_return({fused});
}
EXPECT(p1.sort() == p2.sort());
}
TEST_CASE(int_quant_dot_abs)
{
migraphx::shape s_a{migraphx::shape::int8_type, {5, 4}};
migraphx::shape s_b{migraphx::shape::int8_type, {4, 3}};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto a = mm->add_parameter("a", s_a);
auto b = mm->add_parameter("b", s_b);
auto dot = mm->add_instruction(migraphx::make_op("quant_dot"), a, b);
auto abs = add_pointwise(p1, "main:pointwise0", {dot}, single_pointwise("abs"));
mm->add_return({abs});
}
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto a = mm->add_parameter("a", s_a);
auto b = mm->add_parameter("b", s_b);
auto fused = add_mlir(
p2, "mlir_main:pointwise0", {a, b}, {"y0", "y1"}, [=](auto* pm, const auto& inputs) {
auto dot =
pm->add_instruction(migraphx::make_op("quant_dot"), inputs[0], inputs[1]);
auto abs = pm->add_instruction(migraphx::make_op("abs"), dot);
return std::make_tuple(dot, abs);
});
mm->add_return({fused});
}
EXPECT(p1.sort() == p2.sort());
}
TEST_CASE(int_quant_dot_tanh_fails)
{
migraphx::shape s_a{migraphx::shape::int8_type, {5, 4}};
migraphx::shape s_b{migraphx::shape::int8_type, {4, 3}};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto a = mm->add_parameter("a", s_a);
auto b = mm->add_parameter("b", s_b);
auto dot = mm->add_instruction(migraphx::make_op("quant_dot"), a, b);
auto tanh = add_pointwise(p1, "main:pointwise0", {dot}, single_pointwise("tanh"));
mm->add_return({tanh});
}
migraphx::program p2(p1);
// This pass should do nothing as int32_t tanh isn't supported.
run_pass(p1);
EXPECT(p1 == p2);
}
int main(int argc, const char* argv[])
{
if(migraphx::gpu::mlir_enabled())
test::run(argc, argv);
return 0;
}
......@@ -187,12 +187,39 @@ module {
EXPECT(verify_mlir(m));
}
TEST_CASE(quant_dot_add)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @main(%arg0: tensor<1x5x4xi8>, %arg1: tensor<1x4x3xi8>, %arg2: tensor<1x5x3xi32>) -> tensor<1x5x3xi32> attributes {arch = "", kernel = "mixr"} {
%0 = migraphx.quant_dot(%arg0, %arg1) : (tensor<1x5x4xi8>, tensor<1x4x3xi8>) -> tensor<1x5x3xi32>
%1 = migraphx.add(%0, %arg2) : (tensor<1x5x3xi32>, tensor<1x5x3xi32>) -> tensor<1x5x3xi32>
return %1 : tensor<1x5x3xi32>
}
}
)__migraphx__";
migraphx::module m;
auto arg0 = m.add_parameter("arg0", {migraphx::shape::int8_type, {1, 5, 4}});
auto arg1 = m.add_parameter("arg1", {migraphx::shape::int8_type, {1, 4, 3}});
auto arg2 = m.add_parameter("arg2", {migraphx::shape::int32_type, {1, 5, 3}});
auto conv = m.add_instruction(migraphx::make_op("quant_dot"), arg0, arg1);
auto add = m.add_instruction(migraphx::make_op("add"), conv, arg2);
m.add_return({add});
auto s = migraphx::gpu::dump_mlir(m);
// Skip test if MLIR is not enabled
if(s.empty())
return;
CHECK(encode(s) == encode(mlir_output));
EXPECT(verify_mlir(m));
}
TEST_CASE(dot_add)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_dot(%arg0: tensor<1x5x4xf32>, %arg1: tensor<1x4x3xf32>, %arg2: tensor<1x5x3xf32>) -> tensor<1x5x3xf32> attributes {arch = "", kernel = "mixr"} {
%0 = migraphx.dot(%arg0, %arg1) : tensor<1x5x4xf32>, tensor<1x4x3xf32> -> tensor<1x5x3xf32>
%0 = migraphx.dot(%arg0, %arg1) : (tensor<1x5x4xf32>, tensor<1x4x3xf32>) -> tensor<1x5x3xf32>
%1 = migraphx.add(%0, %arg2) : (tensor<1x5x3xf32>, tensor<1x5x3xf32>) -> tensor<1x5x3xf32>
return %1 : tensor<1x5x3xf32>
}
......@@ -246,4 +273,57 @@ module {
EXPECT(verify_mlir(m));
}
TEST_CASE(dot_convert)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_dot(%arg0: tensor<1x5x4xf32>, %arg1: tensor<1x4x3xf32>) -> tensor<1x5x3xf16> attributes {arch = "", kernel = "mixr"} {
%0 = migraphx.dot(%arg0, %arg1) : (tensor<1x5x4xf32>, tensor<1x4x3xf32>) -> tensor<1x5x3xf32>
%1 = migraphx.convert(%0) {target_type = 1 : i64} : (tensor<1x5x3xf32>) -> tensor<1x5x3xf16>
return %1 : tensor<1x5x3xf16>
}
}
)__migraphx__";
migraphx::module m;
auto arg0 = m.add_parameter("arg0", {migraphx::shape::float_type, {1, 5, 4}});
auto arg1 = m.add_parameter("arg1", {migraphx::shape::float_type, {1, 4, 3}});
auto dot = m.add_instruction(migraphx::make_op("dot"), arg0, arg1);
auto trunc = m.add_instruction(
migraphx::make_op("convert", {{"target_type", migraphx::shape::half_type}}), dot);
m.add_return({trunc});
auto s = migraphx::gpu::dump_mlir(m);
// Skip test if MLIR is not enabled
if(s.empty())
return;
CHECK(encode(s) == encode(mlir_output));
EXPECT(verify_mlir(m));
}
TEST_CASE(dot_where)
{
const std::string mlir_output = R"__migraphx__(
module {
func.func @mlir_dot(%arg0: tensor<1x5x4xf32>, %arg1: tensor<1x4x3xf32>, %arg2: tensor<1x5x3xi8>, %arg3: tensor<1x5x3xf32>) -> tensor<1x5x3xf32> attributes {arch = "", kernel = "mixr"} {
%0 = migraphx.dot(%arg0, %arg1) : (tensor<1x5x4xf32>, tensor<1x4x3xf32>) -> tensor<1x5x3xf32>
%1 = migraphx.where(%arg2, %0, %arg3) : (tensor<1x5x3xi8>, tensor<1x5x3xf32>, tensor<1x5x3xf32>) -> tensor<1x5x3xf32>
return %1 : tensor<1x5x3xf32>
}
}
)__migraphx__";
migraphx::module m;
auto arg0 = m.add_parameter("arg0", {migraphx::shape::float_type, {1, 5, 4}});
auto arg1 = m.add_parameter("arg1", {migraphx::shape::float_type, {1, 4, 3}});
auto arg2 = m.add_parameter("arg2", {migraphx::shape::bool_type, {1, 5, 3}});
auto arg3 = m.add_parameter("arg3", {migraphx::shape::float_type, {1, 5, 3}});
auto dot = m.add_instruction(migraphx::make_op("dot"), arg0, arg1);
auto where = m.add_instruction(migraphx::make_op("where"), arg2, dot, arg3);
m.add_return({where});
auto s = migraphx::gpu::dump_mlir(m);
// Skip test if MLIR is not enabled
if(s.empty())
return;
CHECK(encode(s) == encode(mlir_output));
EXPECT(verify_mlir(m));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -186,6 +186,21 @@ struct nop
migraphx::shape compute_shape(const std::vector<migraphx::shape>&) const { return {}; }
};
struct tuple_op
{
std::string name() const { return "tuple_op"; }
migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
{
return {inputs};
}
migraphx::argument compute(migraphx::context&,
const migraphx::shape&,
const std::vector<migraphx::argument>& input_args) const
{
return input_args;
}
};
inline migraphx::literal get_2x2(int base = 0)
{
return migraphx::literal{{migraphx::shape::float_type, {2, 2}},
......