Unverified commit ff485c7a, authored by Manupa Karunaratne and committed by GitHub

[6.1] Add support for dot-(mul)-softmax-dot offloads to MLIR (#2345)

parent 6d84f7c6
@@ -136,12 +136,14 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    }
}, mlir_debug: rocmnode('mi100+') { cmake_build ->
    stage('MLIR Debug') {
-       withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1']) {
+       withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1', 'MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot']) {
            def sanitizers = "undefined"
            // Note: the -fno-sanitize= is copied from upstream LLVM_UBSAN_FLAGS.
            def debug_flags_cxx = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr,function -fno-sanitize-recover=${sanitizers}"
            def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr -fno-sanitize-recover=${sanitizers}"
            def gpu_targets = getgputargets()
+           // Since the purpose of this run is to verify all things MLIR supports,
+           // enable all possible types of offloads
            cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags_cxx}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}' -DGPU_TARGETS='${gpu_targets}'")
        }
    }
...
@@ -29,4 +29,4 @@ pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/rocMLIR@13f6c2a69cfe80a575c6b241ec7353d1e953cb12 -DBUILD_FAT_LIBROCKCOMPILER=On
+ROCmSoftwarePlatform/rocMLIR@9e66e8050209f03349a41b6b497f0da2b285a53b -DBUILD_FAT_LIBROCKCOMPILER=On
@@ -38,6 +38,18 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_EXTRA_MLIR);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_MLIR);
/**
 * @brief Declares a new MIGraphX environment variable that forces MIGraphX to
 * offload only specific operations to MLIR.
 *
 * If defined, the variable forces MIGraphX to use only the listed operations
 * with MLIR, regardless of the underlying GPU architecture. It accepts a
 * comma-separated list of operations and recognizes the following values:
 * "fused", "convolution", "dot", and "attention". If the variable is not
 * defined, MIGraphX decides by itself which operations to delegate to MLIR.
 * The variable is intended primarily for rocMLIR developers.
 */
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_USE_SPECIFIC_OPS);
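As a concrete illustration (the values come from the Jenkinsfile change at the top of this commit), the MLIR debug CI job now exports

MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot

so that every offload MLIR supports gets exercised, while a rocMLIR developer interested only in the new dot-(mul)-softmax-dot path could set MIGRAPHX_MLIR_USE_SPECIFIC_OPS=attention.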
bool mlir_enabled()
{
@@ -49,6 +61,26 @@ bool mlir_enabled()
#endif
}
static bool is_requested(std::string_view option, bool fallback = false)
{
auto string_value = string_value_of(MIGRAPHX_MLIR_USE_SPECIFIC_OPS{}, "");
if(string_value.empty())
return fallback;
const auto options = split_string(string_value, ',');
return contains(options, option);
}
bool mlir_attention_enabled()
{
#ifdef MIGRAPHX_MLIR
if(not mlir_enabled())
return false;
return is_requested("attention");
#else
return false;
#endif
}
#ifdef MIGRAPHX_MLIR

struct mlir_op
@@ -62,31 +94,20 @@ struct mlir_op
        return pack(f(self.op, "op"));
    }

-   shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
+   shape compute_shape(const std::vector<shape>& inputs, const std::vector<module_ref>& mods) const
    {
+       module_ref mod = mods[0];
        check_shapes{inputs, *this}.packed_or_broadcasted();
        if(mods.size() != 1)
            MIGRAPHX_THROW("should have one submodule.");
        if(inputs.size() < 2)
            MIGRAPHX_THROW("should have at least two inputs.");
-       module_ref mod = mods[0];
        auto type = mod->get_output_shapes().front().type();
        std::unordered_map<instruction_ref, shape> ins_shapes;
-       size_t param_cnt               = 0;
-       std::vector<std::string> names = mod->get_parameter_names();
-       std::sort(names.begin(), names.end());
-       for(const std::string& param_name : names)
-       {
-           ins_shapes[mod->get_parameter(param_name)] = inputs[param_cnt++];
-       }
        for(auto ins : iterator_for(*mod))
        {
-           if(ins->name() == "@param")
-           {
-               continue;
-           }
-           if(ins->name() == "@literal")
+           if(ins->name() == "@literal" or ins->name() == "@param")
            {
                ins_shapes[ins] = ins->get_shape();
                continue;
@@ -112,38 +133,48 @@ struct mlir_op
MIGRAPHX_REGISTER_OP(mlir_op);

namespace {
std::tuple<instruction_ref, std::vector<operation>>
get_fusable_input_op_stream(instruction_ref lower_input)
{
instruction_ref upper_input = lower_input;
std::vector<operation> op_stream;
while(
contains({"slice", "transpose", "contiguous", "reshape", "squeeze", "flatten", "unsqueeze"},
upper_input->name()))
{
operation op = upper_input->get_operator();
if(contains({"squeeze", "flatten", "unsqueeze"}, upper_input->name()))
{
op = migraphx::make_op("reshape", {{"dims", upper_input->get_shape().lens()}});
}
op_stream.push_back(op);
upper_input = upper_input->inputs().at(0);
}
return {upper_input, op_stream};
}
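To make the contract of get_fusable_input_op_stream concrete, here is a hedged sketch with a hypothetical toy graph (built with the public MIGraphX API; the helper itself sits in an anonymous namespace, so the expected result is stated in a comment rather than executed):

#include <migraphx/program.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/make_op.hpp>

void toy_graph_example()
{
    migraphx::program p;
    auto* mm = p.get_main_module();
    migraphx::shape xs{migraphx::shape::float_type, {4, 8}};
    auto x = mm->add_parameter("x", xs);
    // x -> unsqueeze -> transpose feeds some GEMM/convolution input
    auto u = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {0}}}), x);              // {1, 4, 8}
    auto t = mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), u); // {1, 8, 4}
    // get_fusable_input_op_stream(t) would walk past both ops and return {x, {transpose, reshape}},
    // the unsqueeze having been rewritten as reshape{dims = {1, 4, 8}}; callers replay the stream
    // in reverse order inside the fused module.
    (void)t;
}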
std::tuple<instruction_ref, std::vector<instruction_ref>>
-fuse_input_ops_and_gemm_based_op(module_ref mm, instruction_ref gemm_based_op)
+fuse_input_ops_and_gemm_based_op(module_ref mm,
+                                 const std::vector<instruction_ref>& gemm_based_op_inputs,
+                                 const operation& gemm_based_op)
{
    std::vector<instruction_ref> top_inputs;
    std::vector<instruction_ref> imm_inputs;
    size_t input_cnt = 0;
-   for(instruction_ref input : gemm_based_op->inputs())
+   for(instruction_ref input : gemm_based_op_inputs)
    {
-       std::vector<operation> op_stream;
-       while(contains(
-           {"slice", "transpose", "contiguous", "reshape", "squeeze", "flatten", "unsqueeze"},
-           input->name()))
-       {
-           operation op = input->get_operator();
-           if(contains({"squeeze", "flatten", "unsqueeze"}, input->name()))
-           {
-               op = migraphx::make_op("reshape", {{"dims", input->get_shape().lens()}});
-           }
-           op_stream.push_back(op);
-           input = input->inputs().at(0);
-       }
-       top_inputs.push_back(input);
+       auto [upper_input, op_stream] = get_fusable_input_op_stream(input);
+       top_inputs.push_back(upper_input);
        instruction_ref prev_input =
-           mm->add_parameter("y" + std::to_string(input_cnt++), input->get_shape());
+           mm->add_parameter("y" + std::to_string(input_cnt++), upper_input->get_shape());
        for(const auto& op : reverse(op_stream))
        {
            prev_input = mm->add_instruction(op, {prev_input});
        }
        imm_inputs.push_back(prev_input);
    }
-   instruction_ref new_gemm_based_op =
-       mm->add_instruction(gemm_based_op->get_operator(), imm_inputs);
+   instruction_ref new_gemm_based_op = mm->add_instruction(gemm_based_op, imm_inputs);
    return {new_gemm_based_op, top_inputs};
}
@@ -205,102 +236,135 @@ auto is_mlir_conv(mlir_mode mode)
    });
}
std::unordered_map<instruction_ref, instruction_ref>
create_param_map_with_literals(module_ref mm, const module* pm, const shape& shape)
{
    std::unordered_map<instruction_ref, instruction_ref> ins_map;
    for(auto ins : iterator_for(*pm))
    {
        if(ins->name() != "@literal")
            continue;
        literal r               = ins->get_literal();
        instruction_ref literal = mm->add_literal(r);
        instruction_ref mbcast =
            mm->add_instruction(make_op("multibroadcast", {{"out_lens", shape.lens()}}), literal);
        ins_map[ins] = mbcast;
    }
    return ins_map;
}

std::vector<instruction_ref>
fold_pointwise_mod(instruction_ref pm_ins,
                   module_ref parent_mod,
                   const std::unordered_map<instruction_ref, instruction_ref>& ins_map)
{
    auto* pm   = pm_ins->module_inputs().front();
    auto names = pm->get_parameter_names();
    std::sort(names.begin(), names.end());
    std::unordered_map<instruction_ref, instruction_ref> param_map =
        create_param_map_with_literals(parent_mod, pm, pm_ins->get_shape());
    std::transform(names.begin(),
                   names.end(),
                   pm_ins->inputs().begin(),
                   std::inserter(param_map, param_map.end()),
                   [&](auto name, auto input) {
                       if(ins_map.count(input))
                           return std::make_pair(pm->get_parameter(name), ins_map.at(input));
                       return std::make_pair(pm->get_parameter(name),
                                             parent_mod->add_parameter(name, input->get_shape()));
                   });
    return parent_mod->insert_instructions(parent_mod->end(), pm, param_map);
}

// Whitelist supported fusion options, including imposing type constraints
// for cases where MLIR only supports an operation (usually a pointwise function)
// on particular types.
bool is_pointwise_op_supported_by_mlir(const instruction& i)
{
    using type_t           = shape::type_t;
    const auto& name       = i.name();
    const auto result_type = i.get_shape().type();
    const std::initializer_list<type_t> allowed_types = {type_t::float_type,
                                                         type_t::half_type,
                                                         type_t::int8_type,
                                                         type_t::int32_type,
                                                         type_t::bool_type};
    // Preliminary type check.
    if(not contains(allowed_types, result_type))
    {
        return false;
    }
    const std::initializer_list<std::string> any_type_ops = {"@literal", "@param", "@return"};
    const std::initializer_list<std::string> no_bool_ops  = {
        "convolution",
        "quant_convolution",
        "dot",
        "quant_dot",
        "add",
        "clip",
        "relu",
        "sub",
        "mul",
        "div",
        "pow",
        "where",
        "quantizelinear",
        "dequantizelinear",
        "abs",
        "neg",
    };
    const std::initializer_list<std::string> fp_only_ops = {
        "ceil",
        "erf",
        "exp",
        "floor",
        "log",
        "recip",
        "rsqrt",
        "sigmoid",
        "softmax",
        "tanh",
    };
    bool is_float = contains({type_t::float_type, type_t::half_type}, result_type);
    if(contains(any_type_ops, name))
        return true;
    if(result_type != type_t::bool_type and contains(no_bool_ops, name))
        return true;
    if(is_float and contains(fp_only_ops, name))
        return true;
    // Only conversions between floating types are known to be unambigiously
    // supported.
    if(is_float and name == "convert")
    {
        return std::all_of(i.inputs().begin(), i.inputs().end(), [](const auto& arg) {
            return contains({type_t::float_type, type_t::half_type}, arg->get_shape().type());
        });
    }
    return false;
}

MIGRAPHX_PRED_MATCHER(mlir_pointwise, instruction_ref ins)
{
    if(ins->name() != "pointwise")
        return false;
    auto* pm = ins->module_inputs().front();
    return std::all_of(pm->begin(), pm->end(), [&](const auto& i) {
        return is_pointwise_op_supported_by_mlir(i);
    });
}

struct find_mlir_fused_ops
{
    mlir_mode conv_mode = mlir_mode::none;
    mlir_mode dot_mode  = mlir_mode::none;
    auto matcher() const
    {
        auto dot_or_conv = match::skip(match::name("contiguous"))(
            match::any_of(is_mlir_dot(dot_mode), is_mlir_conv(conv_mode)).bind("gemm_based_op"));
-       return match::name("pointwise")(match::any_of[match::inputs()](dot_or_conv.bind("x")));
+       return mlir_pointwise()(match::any_of[match::inputs()](dot_or_conv.bind("x")));
    }

    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
    {
@@ -309,29 +373,12 @@ struct find_mlir_fused_ops
        auto x_ins = r.instructions["x"]; // input after contiguous
        auto* pm   = ins->module_inputs().front();
        auto names = pm->get_parameter_names();
-       // Whitelist pointwise operators.
-       if(std::any_of(pm->begin(), pm->end(), [&](const auto& i) {
-              return not is_pointwise_op_supported_by_mlir(i);
-          }))
-           return;
        std::sort(names.begin(), names.end());
        module_ref mm = mpm.create_module("mlir_" + pm->name());
        mm->set_bypass();
-       std::unordered_map<instruction_ref, instruction_ref> param_map =
-           create_param_map_with_literals(mm, pm, gemm_based_op->get_shape());
-       auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(mm, gemm_based_op);
-       std::transform(names.begin(),
-                      names.end(),
-                      ins->inputs().begin(),
-                      std::inserter(param_map, param_map.end()),
-                      [&, &anchor = anchor_op](auto name, auto input) {
-                          if(input == x_ins)
-                              return std::make_pair(pm->get_parameter(name), anchor);
-                          return std::make_pair(pm->get_parameter(name),
-                                                mm->add_parameter(name, input->get_shape()));
-                      });
-       mm->add_return(mm->insert_instructions(mm->end(), pm, param_map));
+       auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(
+           mm, gemm_based_op->inputs(), gemm_based_op->get_operator());
+       mm->add_return(fold_pointwise_mod(ins, mm, {{x_ins, anchor_op}}));
        std::vector<instruction_ref> inputs;
        std::copy_if(ins->inputs().begin(),
@@ -349,52 +396,103 @@ struct find_mlir_standalone_op
{
    mlir_mode mode = mlir_mode::none;
    auto matcher() const { return Matcher(mode); }

    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
    {
-       auto conv_based_op = r.result;
-       //
+       auto gemm_based_op = r.result;
        // enable only for fp32/fp16/i8 types
-       if(std::any_of(conv_based_op->inputs().begin(), conv_based_op->inputs().end(), [&](auto i) {
+       if(std::any_of(gemm_based_op->inputs().begin(), gemm_based_op->inputs().end(), [&](auto i) {
               return not contains(
                   {shape::type_t::float_type, shape::type_t::half_type, shape::type_t::int8_type},
                   i->get_shape().type());
           }))
            return;
        static size_t counter = 0;
        module_ref mm =
-           mpm.create_module("mlir_" + conv_based_op->name() + std::to_string(counter++));
+           mpm.create_module("mlir_" + gemm_based_op->name() + std::to_string(counter++));
        mm->set_bypass();
-       auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(mm, conv_based_op);
+       auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(
+           mm, gemm_based_op->inputs(), gemm_based_op->get_operator());
        mm->add_return({anchor_op});
        mpm.get_module().replace_instruction(
-           conv_based_op, mlir_op{conv_based_op->get_operator()}, top_inputs, {mm});
+           gemm_based_op, mlir_op{gemm_based_op->get_operator()}, top_inputs, {mm});
    }
};
using find_mlir_standalone_convolution_op = find_mlir_standalone_op<&is_mlir_conv>;
using find_mlir_standalone_dot_op         = find_mlir_standalone_op<&is_mlir_dot>;
struct find_mlir_standalone_attention_op
{
    auto matcher() const
    {
        return match::name("gpu::pre_gemm_softmax_gemm").bind("gemm_softmax_gemm");
    }

    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
    {
        static size_t counter  = 0;
        module_ref mm          = mpm.create_module("mlir_" + std::to_string(counter++));
        auto gemm_softmax_gemm = r.instructions["gemm_softmax_gemm"];
        std::vector<instruction_ref> inputs;
        mm->set_bypass();
        std::unordered_map<instruction_ref, instruction_ref> ins_map;
        auto gemm0_inputs = gemm_softmax_gemm->inputs();
        gemm0_inputs.pop_back();
        auto [gemm0, top_gemm0_inputs] =
            fuse_input_ops_and_gemm_based_op(mm, gemm0_inputs, make_op("dot"));
        inputs.insert(inputs.begin(), top_gemm0_inputs.begin(), top_gemm0_inputs.end());

        // handle scale
        auto v = gemm_softmax_gemm->get_operator().to_value();
        assert(v.contains("scale"));
        auto scale     = v.at("scale").to<float>();
        auto scale_lit = mm->add_literal(literal{shape{gemm0->get_shape().type()}, {scale}});
        instruction_ref scale_lit_mbcast = mm->add_instruction(
            make_op("multibroadcast", {{"out_lens", gemm0->get_shape().lens()}}), scale_lit);
        auto scaled_gemm0 = mm->add_instruction(make_op("mul"), gemm0, scale_lit_mbcast);

        auto softmax = mm->add_instruction(
            make_op("softmax", {{"axis", gemm0->get_shape().lens().size() - 1}}), scaled_gemm0);

        auto [old_upper_v, upper_v_op_stream] =
            get_fusable_input_op_stream(gemm_softmax_gemm->inputs()[2]);
        instruction_ref new_upper_v = mm->add_parameter("z", old_upper_v->get_shape());
        for(const auto& op : reverse(upper_v_op_stream))
        {
            new_upper_v = mm->add_instruction(op, {new_upper_v});
        }
        inputs.push_back(old_upper_v);
        auto gemm1 = mm->add_instruction(make_op("dot"), {softmax, new_upper_v});

        ins_map[gemm_softmax_gemm] = gemm1;
        auto ins_to_replace        = gemm1;
        auto ins_to_be_replaced    = gemm_softmax_gemm;
        if(r.instructions.find("trailing_pm") != r.instructions.end())
        {
            ins_to_replace = fold_pointwise_mod(r.instructions["trailing_pm"], mm, ins_map)[0];
            std::copy_if(r.instructions["trailing_pm"]->inputs().begin(),
                         r.instructions["trailing_pm"]->inputs().end(),
                         std::back_inserter(inputs),
                         [&](auto input) { return input != gemm_softmax_gemm; });
            ins_to_be_replaced = r.instructions["trailing_pm"];
        }
        mm->add_return({ins_to_replace});
        mpm.get_module().replace_instruction(
            ins_to_be_replaced, mlir_op{gemm1->get_operator()}, inputs, {mm});
    }
};

struct find_mlir_attention_fused_ops : public find_mlir_standalone_attention_op
{
    auto matcher() const
    {
        auto standalone_matcher = find_mlir_standalone_attention_op::matcher();
        return mlir_pointwise()(
            match::any_of[match::inputs()](standalone_matcher).bind("trailing_pm"));
    }
};
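Taken together, the module built above captures the attention-style computation out = dot(softmax(scale * dot(A, B)), V): the first GEMM over the first two inputs, a multiply by the scale recorded on gpu::pre_gemm_softmax_gemm, a softmax over the last axis, and a second GEMM with the third input, with any fusable slice/transpose/reshape chains on the inputs replayed inside the module. find_mlir_attention_fused_ops additionally folds one trailing pointwise module (for example a relu) into the same offload; the gemm_softmax_gemm_relu verify test added at the bottom of this commit exercises this shape of graph.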
} // namespace
#endif // MIGRAPHX_MLIR
@@ -416,6 +514,13 @@ void fuse_mlir::apply(module_pass_manager& mpm) const
    mlir_mode mode =
        (enabled(MIGRAPHX_ENABLE_EXTRA_MLIR{}) or enable_extra) ? mlir_mode::fast : mlir_mode::none;
// Attention offloads; default disabled
if(mlir_attention_enabled())
{
match::find_matches(mpm, find_mlir_attention_fused_ops{});
match::find_matches(mpm, find_mlir_standalone_attention_op{});
}
    match::find_matches(mpm,
                        find_mlir_fused_ops{.conv_mode = get_mode("fused", mlir_mode::fast),
                                            .dot_mode  = get_mode("fused", mode)});
...
@@ -34,10 +34,11 @@ struct module_pass_manager;
namespace gpu {

MIGRAPHX_GPU_EXPORT bool mlir_enabled();
MIGRAPHX_GPU_EXPORT bool mlir_attention_enabled();
struct MIGRAPHX_GPU_EXPORT fuse_mlir
{
    context* ctx      = nullptr;
    bool enable_extra = false;
    std::string name() const { return "gpu::fuse_mlir"; }
    void apply(module_pass_manager& mpm) const;
...
@@ -66,6 +66,10 @@ struct gemm_softmax_gemm
    }

    static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); }
static bool is_mlir_supported_type(shape::type_t t)
{
return contains({shape::type_t::float_type, shape::half_type}, t);
}
};

} // namespace gpu
...
@@ -1032,6 +1032,15 @@ tuning_config get_tuning_config_mlir(const context& migraphx_ctx,
    mlir_program mp;
    mp.set_gpu_properties(migraphx_ctx);
    mp.parse(m);
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
static std::mutex mutex;
if(trace)
{
const std::lock_guard<std::mutex> lock(mutex);
auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
}
    return mp.get_tuning_config(exhaustive);
}
...
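A brief usage note: MIGRAPHX_TRACE_MLIR is the trace flag that the enabled(MIGRAPHX_TRACE_MLIR{}) check above reads, so setting it (for example MIGRAPHX_TRACE_MLIR=1) now also prints the MLIR module that is handed to rocMLIR's tuning path, which helps when inspecting tuning of the new attention offloads.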
@@ -31,6 +31,7 @@
#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
#include <migraphx/gpu/ck.hpp>
#endif
#include <migraphx/gpu/fuse_mlir.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
@@ -124,34 +125,55 @@ struct find_add_layernorm
    }
};
-#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
struct pre_gemm_softmax_gemm : gemm_softmax_gemm
{
    std::string name() const { return "gpu::pre_gemm_softmax_gemm"; }
};
MIGRAPHX_REGISTER_OP(pre_gemm_softmax_gemm);

-MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
+auto is_ck_gemm()
{
-   if(ins->name() != "dot")
-       return false;
-   if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type()))
-       return false;
-   return true;
+   return match::make_basic_pred_matcher([=](instruction_ref ins) {
+#ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
+       if(not enabled(MIGRAPHX_ENABLE_CK{}))
+           return false;
+       if(ins->name() != "dot")
+           return false;
+       if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type()))
+           return false;
+       return true;
+#else
+       (void)ins;
+       return false;
+#endif
+   });
+}
+
+auto is_mlir_gemm()
+{
+   return match::make_basic_pred_matcher([=](instruction_ref ins) {
+       if(not mlir_attention_enabled())
+           return false;
+       if(ins->name() != "dot")
+           return false;
+       return std::all_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
+           return pre_gemm_softmax_gemm::is_mlir_supported_type(i->get_shape().type());
+       });
+   });
}

struct find_gemm_softmax_gemm
{
    auto matcher() const
    {
-       auto gemm1 =
-           match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1")));
+       auto gemm1 = match::skip(match::name("contiguous"))(
+           match::name("dot")(match::any_of(is_ck_gemm(), is_mlir_gemm()).bind("gemm1")));
        auto mul = match::name("mul")(
            match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1));
        auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax");
-       return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax));
+       return match::name("dot")(match::any_of(is_ck_gemm(), is_mlir_gemm()).bind("gemm2"))(
+           match::arg(0)(softmax));
    }

    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
@@ -179,8 +201,6 @@ struct find_gemm_softmax_gemm
    }
};

-#endif
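Note the behavioural consequence of the hunk above: pre_gemm_softmax_gemm and find_gemm_softmax_gemm are no longer compiled only under MIGRAPHX_USE_COMPOSABLEKERNEL. The CK gate moved inside is_ck_gemm(), which now also checks MIGRAPHX_ENABLE_CK at match time, while the new is_mlir_gemm() lets the same matcher fire for fp32/fp16 dots whenever the attention offload is requested via MIGRAPHX_MLIR_USE_SPECIFIC_OPS.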
} // namespace

void prefuse_ops::apply(module_pass_manager& mpm) const
@@ -188,10 +208,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
    match::find_matches(mpm.get_module(), find_layernorm{});
    mpm.run_pass(dead_code_elimination{});
    match::find_matches(mpm.get_module(), find_add_layernorm{});
-#ifdef MIHRAPHX_USE_COMPOSABLEKERNEL
-   if(enabled(MIGRAPHX_ENABLE_CK{}))
-       match::find_matches(mpm, find_gemm_softmax_gemm{});
-#endif
+   match::find_matches(mpm, find_gemm_softmax_gemm{});
}

} // namespace gpu
...
@@ -144,10 +144,12 @@ TEST_CASE(int_quant_dot_tanh_fails)
        auto tanh = add_pointwise(p1, "main:pointwise0", {dot}, single_pointwise("tanh"));
        mm->add_return({tanh});
    }
-   migraphx::program p2(p1);
-   // This pass should do nothing as int32_t tanh isn't supported.
+   // This pass should not fuse as int32_t tanh isn't supported.
    run_pass(p1);
-   EXPECT(p1 == p2);
+   auto* mm = p1.get_main_module();
+   bool has_pointwise =
+       std::any_of(mm->begin(), mm->end(), [&](const auto& i) { return i.name() == "pointwise"; });
+   EXPECT(has_pointwise);
}

int main(int argc, const char* argv[])
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
struct gemm_softmax_gemm_relu : verify_program<gemm_softmax_gemm_relu>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape m1_shape{migraphx::shape::half_type, {1, 12, 256, 256}};
migraphx::shape m2_shape{migraphx::shape::half_type, {1, 12, 256, 256}};
auto m2_elements = m2_shape.elements();
auto a = mm->add_parameter("1", m1_shape);
auto b = mm->add_parameter("2", m1_shape);
auto b1 = mm->add_parameter("3", m1_shape);
std::vector<float> eights(m2_elements, 0.125);
auto eight = mm->add_literal(migraphx::literal{m2_shape, eights});
std::vector<float> zeros(m2_elements, 0);
auto zero = mm->add_literal(migraphx::literal{m2_shape, zeros});
b = mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), b);
auto gemm1 = mm->add_instruction(migraphx::make_op("dot"), a, b);
auto scale = mm->add_instruction(migraphx::make_op("mul"), gemm1, eight);
auto bias = mm->add_instruction(migraphx::make_op("add"), scale, zero);
auto softmax = mm->add_instruction(migraphx::make_op("softmax", {{"axis", 3}}), bias);
auto gemm2 = mm->add_instruction(migraphx::make_op("dot"), softmax, b1);
mm->add_instruction(migraphx::make_op("relu"), gemm2);
return p;
}
};