Update flash_mla_with_kvcache: add is_fp8_kvcache and indices arguments

1e622f10 · zhuwenwen · 31a3beb5 · 1e622f10 · 1e622f10 · 1e622f10
Commit 1e622f10 authored Dec 19, 2025 by zhuwenwen
3 changed files
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -288,6 +288,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
                num_splits=decode_meta.decode_num_splits,
                softmax_scale=self.scale,
                causal=True,
+                is_fp8_kvcache=False,
+                indices= None,
                k_scale = k_scale,
                kv_cache_dtype = kv_cache_dtype,
            )

--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -101,6 +101,8 @@ def flash_mla_with_kvcache(
    num_splits: torch.Tensor,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
+    is_fp8_kvcache: bool = False,
+    indices: Optional[torch.Tensor] = None,
    k_scale = None,
    kv_cache_dtype = "auto",
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -145,7 +147,6 @@ def flash_mla_with_kvcache(
        out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
            q,
            k_cache,
-            None,
            head_dim_v,
            cache_seqlens,
            block_table,
@@ -153,6 +154,8 @@ def flash_mla_with_kvcache(
            causal,
            tile_scheduler_metadata,
            num_splits,
+            is_fp8_kvcache,
+            indices,
        )
    else:
        out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(

--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -232,6 +232,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
                    num_splits=attn_metadata.decode.num_splits,
                    softmax_scale=self.scale,
                    causal=True,
+                    is_fp8_kvcache=False,
+                    indices= None,
                    k_scale = k_scale,
                    kv_cache_dtype = kv_cache_dtype,            
                )