Commit ca2958a8 authored by zhaochao

[DCU] Fix the dimension bug in MLA under the FlashAttention backend.


Signed-off-by: zhaochao <zhaochao1@sugon.com>
parent 565fd629
......
@@ -216,6 +216,9 @@ def test_dot_product_attention(
    # FlashAttention backend
    if flash_attn_supported:
        from torch.utils.cpp_extension import IS_HIP_EXTENSION
        if IS_HIP_EXTENSION and config.head_dim_qk < config.head_dim_v:
            pytest.skip("FlashAttention on ROCm does not support MLA with head_dim_qk < head_dim_v")
        flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention(
            dtype,
            config,
......
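For reference, a minimal sketch of the guard added above, factored into a standalone helper. IS_HIP_EXTENSION and the head_dim_qk/head_dim_v attribute names come from the hunk itself; the ModelConfig dataclass and the 192/256 head dims are illustrative assumptions, not part of the test.

from dataclasses import dataclass

from torch.utils.cpp_extension import IS_HIP_EXTENSION


@dataclass
class ModelConfig:  # hypothetical stand-in for the test's config object
    head_dim_qk: int
    head_dim_v: int


def mla_unsupported_on_rocm(config: ModelConfig) -> bool:
    # ROCm FlashAttention cannot run MLA cases where the V head dim exceeds
    # the Q/K head dim, so such test cases are skipped rather than executed.
    return IS_HIP_EXTENSION and config.head_dim_qk < config.head_dim_v


# head_dim_qk=192 < head_dim_v=256 would be skipped on a ROCm build of torch.
print(mla_unsupported_on_rocm(ModelConfig(head_dim_qk=192, head_dim_v=256)))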
......
@@ -890,6 +890,13 @@ class FlashAttention(torch.nn.Module):
        elif q_format == "thd":
            # thd -> t(hd)
            output = output.reshape(output.shape[0], -1)
        if value_layer.shape[-1] != query_layer.shape[-1]:
            v_dim = value_layer.shape[-1]
            num_heads = query_layer.shape[-2]
            # Restore to (..., num_heads, head_dim_qk)
            out_shape_heads = output.shape[:-1] + (num_heads, query_layer.shape[-1])
            output = output.view(out_shape_heads)[..., :v_dim]  # trim to V's head dim
            output = output.reshape(output.shape[:-2] + (num_heads * v_dim,))
        return output.contiguous()
......
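The added reshape-and-slice can be exercised in isolation. Below is a minimal sketch with assumed MLA-style shapes (192-dim Q/K heads, 128-dim V heads); the fix implies the fused output carries head_dim_qk slots per head whenever the Q/K and V head dims differ, so the extra columns are dropped before the result is returned.

import torch

total_tokens, num_heads = 8, 4
head_dim_qk, head_dim_v = 192, 128  # illustrative MLA-style head dims

# Stand-in for the "thd -> t(hd)" output above, still head_dim_qk wide per head.
output = torch.randn(total_tokens, num_heads * head_dim_qk)

# Restore (..., num_heads, head_dim_qk), drop the padded tail, re-flatten.
out_shape_heads = output.shape[:-1] + (num_heads, head_dim_qk)
output = output.view(out_shape_heads)[..., :head_dim_v]
output = output.reshape(output.shape[:-2] + (num_heads * head_dim_v,))
print(output.shape)  # torch.Size([8, 512]) == (total_tokens, num_heads * head_dim_v)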