If kv_cache_dtype == "fp8_e4m3", use the non-fused cat + MLA path

ba0cd35c · zhuwenwen · 92058666 · ba0cd35c
Commit ba0cd35c authored Dec 01, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

vllm/v1/attention/backends/mla/flashmla.py vllm/v1/attention/backends/mla/flashmla.py +2 -2

No files found.
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -168,7 +168,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
        assert kv_c_and_k_pe_cache.numel() > 0
        assert attn_metadata.decode is not None

-        if not envs.VLLM_USE_CAT_MLA:
+        if not envs.VLLM_USE_CAT_MLA or kv_cache_dtype == "fp8_e4m3":
            if envs.VLLM_USE_OPT_CAT:
                if q_nope.shape[0] < 1024:
                    from vllm.v1.attention.backends.mla.test_concat import concat_helper_decode
@@ -181,7 +181,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
                q = torch.cat([q_nope, q_pe], dim=-1)\
                    .unsqueeze(1) # Add seqlen dim of 1 (decode)

-        if not envs.VLLM_USE_CAT_MLA:
+        if not envs.VLLM_USE_CAT_MLA or kv_cache_dtype == "fp8_e4m3":
            o, _ = flash_mla_with_kvcache(
                q=q,
                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1