update flash_mla_with_kvcache

9663a03f · zhuwenwen · d0e16bf5 · 9663a03f · 9663a03f
Commit 9663a03f authored Dec 23, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 36 additions and 24 deletions

vllm/attention/ops/flashmla.py vllm/attention/ops/flashmla.py +9 -11

vllm/v1/attention/backends/mla/flashmla.py vllm/v1/attention/backends/mla/flashmla.py +27 -13

No files found.
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -226,16 +226,14 @@ def flash_mla_with_kvcache(
            out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
                q, 
                k_cache, 
-                block_table,
+                None, 
-                cache_seqlens,
                head_dim_v, 
-                tile_scheduler_metadata,
+                cache_seqlens, 
-                num_splits,
+                block_table, 
                softmax_scale,
                causal, 
-                is_fp8_kvcache,
+                tile_scheduler_metadata, 
-                indices,
+                num_splits)
-            )
        else:
            out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(
                q,

--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -34,6 +34,7 @@ from vllm.v1.attention.backends.utils import (
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm import envs
+from vllm.platforms import current_platform
 logger = init_logger(__name__)
@@ -310,6 +311,19 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
            # zeros of length B+1
            num_splits = torch.zeros((B + 1,), dtype=dtype, device=device)
+        if current_platform.is_rocm():
+            o, lse = flash_mla_with_kvcache(
+                q=q,
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1
+                block_table=attn_metadata.decode.block_table,
+                cache_seqlens=attn_metadata.decode.seq_lens,
+                head_dim_v=self.kv_lora_rank,
+                tile_scheduler_metadata=tile_scheduler_metadata,
+                num_splits=num_splits,
+                softmax_scale=self.scale,
+                causal=True,
+            )
+        else:
            o, lse = flash_mla_with_kvcache(
                q=q,
                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1