Unverified Commit faee0e8b authored by yuzhongw-nvidia, committed by GitHub

Support Context Parallel for Multi Latent Attention (MLA) (#1729)



* Support MLA (qk_dim != v_dim) for AttnFuncWithCPAndKVP2P
Signed-off-by: Yuzhong Wang <yuzhongw@nvidia.com>

* add UT for MLA CP
Signed-off-by: Yuzhong Wang <yuzhongw@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* refine the code
Signed-off-by: Yuzhong Wang <yuzhongw@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* refine the code
Signed-off-by: Yuzhong Wang <yuzhongw@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Yuzhong Wang <yuzhongw@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Xiaowei Ren <103958965+xrennvidia@users.noreply.github.com>
parent 031c6cf6
@@ -107,6 +107,18 @@ model_configs_fused_attn = {
     "cp_2_4": ModelConfig(
         2, 12, 2, 128, 4096, 4096, 0.0, "causal", "no_bias", window_size=(512, 0)
     ), # GQA
+    "cp_3_0": ModelConfig(
+        2, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias", head_dim_v=64
+    ), # MLA
+    "cp_3_1": ModelConfig(
+        2, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "no_bias", head_dim_v=64
+    ), # MLA
+    "cp_3_2": ModelConfig(
+        2, 12, 12, 128, 4096, 4096, 0.0, "causal", "post_scale_bias", head_dim_v=64
+    ), # MLA
+    "cp_3_3": ModelConfig(
+        2, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "post_scale_bias", head_dim_v=64
+    ), # MLA
 }
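The head_dim_v=64 argument is what makes these configs exercise the MLA path: queries and keys use a 128-dim head while values use a 64-dim head. Below is a minimal, illustrative PyTorch sketch of attention with mismatched QK/V head dims (a sketch of the tensor shapes only, not the fused kernel these tests run; the sequence length is shortened from 4096 for a quick check):

import torch

# Shapes mirroring the cp_3_* configs above: batch 2, 12 heads,
# head_dim_qk 128, head_dim_v 64 (seq length reduced to 512 here).
b, h, s, d_qk, d_v = 2, 12, 512, 128, 64
q = torch.randn(b, h, s, d_qk)
k = torch.randn(b, h, s, d_qk)
v = torch.randn(b, h, s, d_v)

# Attention scores depend only on the QK head dim ...
scores = torch.softmax(q @ k.transpose(-2, -1) / d_qk**0.5, dim=-1)
# ... while the output inherits the (smaller) V head dim.
out = scores @ v
assert out.shape == (b, h, s, d_v)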
@@ -159,6 +171,8 @@ def test_cp_with_fused_attention(dtype, model, qkv_format, cp_comm_type, fp8_mha
     )
     if dtype != "fp8" and fp8_mha:
         pytest.skip("Only fp8 works with fp8_mha=True!")
+    if "p2p" not in cp_comm_type and config.head_dim_qk != config.head_dim_v:
+        pytest.skip("MLA CP currently only support KV P2P!")
     subprocess.run(
         get_bash_arguments(
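The new skip encodes the limitation stated in the commit message: MLA context parallelism is only wired up for the KV P2P path, so non-P2P communication schemes are skipped whenever head_dim_qk != head_dim_v. Assuming the test parametrizes the usual cp_comm_type values ("p2p", "all_gather", "a2a", "a2a+p2p" — an assumption about the parametrization, not quoted from this diff), the guard plays out as follows:

# Hypothetical walk-through of the skip condition above; the actual
# parametrized values come from the test file, not this snippet.
head_dim_qk, head_dim_v = 128, 64  # an MLA config such as cp_3_0
for cp_comm_type in ("p2p", "all_gather", "a2a", "a2a+p2p"):
    skipped = "p2p" not in cp_comm_type and head_dim_qk != head_dim_v
    print(cp_comm_type, "skipped" if skipped else "runs")
# -> p2p runs, all_gather skipped, a2a skipped, a2a+p2p runs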
@@ -608,11 +608,6 @@ def get_attention_backend(
                 " bias for THD format"
             )
             use_fused_attention = False
-        elif head_dim_qk != head_dim_v:
-            logger.debug(
-                "Disabling FusedAttention as it does not support context parallelism with MLA"
-            )
-            use_fused_attention = False
     # Filter: Attention mask
     # attn_mask_type | attention_mask | supported backends
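Removing this filter lets get_attention_backend keep FusedAttention as a candidate when head_dim_qk != head_dim_v under context parallelism, which the P2P ring attention path (AttnFuncWithCPAndKVP2P) now supports. One practical wrinkle MLA adds to the KV P2P exchange is that K and V no longer share a last dimension, so they cannot be stacked into a single [2, ...] buffer for one send/recv. The snippet below is a hedged sketch of one way to pack such tensors (an illustration of the constraint, not necessarily how AttnFuncWithCPAndKVP2P implements it):

import torch

b, h, s, d_qk, d_v = 2, 12, 512, 128, 64
k = torch.randn(b, h, s, d_qk)
v = torch.randn(b, h, s, d_v)

# With equal head dims, torch.stack([k, v]) gives one contiguous buffer.
# With MLA's unequal head dims, one option is to concatenate along the
# last dim for the P2P send and split again on the receiving rank.
packed = torch.cat([k, v], dim=-1)            # [b, h, s, d_qk + d_v]
k_rx, v_rx = packed.split([d_qk, d_v], dim=-1)
assert torch.equal(k_rx, k) and torch.equal(v_rx, v)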