Update the flash_mla_with_kvcache_fp8 interface and its k_cache argument

77210184 · zhuwenwen · 347fc09c · 77210184 · 77210184 · 77210184
Commit 77210184 authored Dec 17, 2025 by zhuwenwen
3 changed files
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -17,7 +17,6 @@ from vllm.attention.backends.mla.common import (MLACommonBackend,
 from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
                                         get_mla_metadata,
                                         flash_mla_with_kvcache_fp8,
-                                         get_mla_decoding_metadata_dense_fp8,
                                         is_flashmla_supported)
 from vllm import envs
@@ -239,7 +238,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
        if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" and kv_cache_dtype == "fp8_e4m3" and envs.VLLM_USE_FLASH_MLA_FP8:
            o, _ = flash_mla_with_kvcache_fp8(
                q=q.to(torch.float8_e4m3fn),
-                k_cache=kv_c_and_k_pe_cache.view(torch.float8_e4m3fn).unsqueeze(-2),  # Add head dim of 1
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2).to(torch.float8_e4m3fn),  # Add head dim of 1
                block_table=decode_meta.block_tables,
                cache_seqlens=decode_meta.seq_lens_tensor,
                head_dim_v=self.kv_lora_rank,

--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -73,6 +73,7 @@ def get_mla_decoding_metadata_dense_fp8(
    cache_seqlens: torch.Tensor,
    num_heads_per_head_k: int,
    num_heads_k: int,
+    num_heads_q : int = 16,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Arguments:
@@ -87,7 +88,7 @@ def get_mla_decoding_metadata_dense_fp8(
    """
    return flash_mla_cuda.get_mla_decoding_metadata_dense_fp8(cache_seqlens,
                            num_heads_per_head_k,
-                            num_heads_k)
+                            num_heads_k, num_heads_q)
 def flash_mla_with_kvcache(

--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -12,7 +12,6 @@ from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
                                         flash_mla_with_kvcache_q_nope_pe,
                                         get_mla_metadata,
                                         flash_mla_with_kvcache_fp8,
-                                         get_mla_decoding_metadata_dense_fp8,
                                         is_flashmla_supported)
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
@@ -183,10 +182,9 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
            else:
                q = torch.cat([q_nope, q_pe], dim=-1)\
                    .unsqueeze(1) # Add seqlen dim of 1 (decode)
            o, _ = flash_mla_with_kvcache_fp8(
                q=q.to(torch.float8_e4m3fn),
-                k_cache=kv_c_and_k_pe_cache.view(torch.float8_e4m3fn).unsqueeze(-2),  # Add head dim of 1
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2).to(torch.float8_e4m3fn),  # Add head dim of 1
                block_table=attn_metadata.decode.block_table,
                cache_seqlens=attn_metadata.decode.seq_lens,
                head_dim_v=self.kv_lora_rank,
@@ -213,7 +211,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
                    q = torch.cat([q_nope, q_pe], dim=-1)\
                        .unsqueeze(1) # Add seqlen dim of 1 (decode)
-            if not envs.VLLM_USE_CAT_MLA or kv_cache_dtype == "fp8_e4m3":
                o, _ = flash_mla_with_kvcache(
                    q=q,
                    k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1