Update flash_mla_with_kvcache call arguments on the ROCm path

9663a03f · zhuwenwen · d0e16bf5 · 9663a03f · 9663a03f
Commit 9663a03f authored Dec 23, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 36 additions and 24 deletions

vllm/attention/ops/flashmla.py vllm/attention/ops/flashmla.py +9 -11

vllm/v1/attention/backends/mla/flashmla.py vllm/v1/attention/backends/mla/flashmla.py +27 -13

No files found.
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -224,18 +224,16 @@ def flash_mla_with_kvcache(
    else:
        if current_platform.is_rocm():
            out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
-                q,
+                q, 
-                k_cache,
+                k_cache, 
-                block_table,
+                None, 
-                cache_seqlens,
+                head_dim_v, 
-                head_dim_v,
+                cache_seqlens, 
-                tile_scheduler_metadata,
+                block_table, 
-                num_splits,
                softmax_scale,
-                causal,
+                causal, 
-                is_fp8_kvcache,
+                tile_scheduler_metadata, 
-                indices,
+                num_splits)
-            )
        else:
            out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(
                q,

--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -34,6 +34,7 @@ from vllm.v1.attention.backends.utils import (
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm import envs
+from vllm.platforms import current_platform
 logger = init_logger(__name__)
@@ -310,19 +311,32 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
            # zeros of length B+1
            num_splits = torch.zeros((B + 1,), dtype=dtype, device=device)
-        o, lse = flash_mla_with_kvcache(
+        if current_platform.is_rocm():
-            q=q,
+            o, lse = flash_mla_with_kvcache(
-            k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1
+                q=q,
-            block_table=attn_metadata.decode.block_table,
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1
-            cache_seqlens=attn_metadata.decode.seq_lens,
+                block_table=attn_metadata.decode.block_table,
-            head_dim_v=self.kv_lora_rank,
+                cache_seqlens=attn_metadata.decode.seq_lens,
-            tile_scheduler_metadata=tile_scheduler_metadata,
+                head_dim_v=self.kv_lora_rank,
-            num_splits=num_splits,
+                tile_scheduler_metadata=tile_scheduler_metadata,
-            softmax_scale=self.scale,
+                num_splits=num_splits,
-            causal=True,
+                softmax_scale=self.scale,
-            descale_q=layer._q_scale.reshape(1),
+                causal=True,
-            descale_k=layer._k_scale.reshape(1),
+            )
-        )
+        else:
+            o, lse = flash_mla_with_kvcache(
+                q=q,
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1
+                block_table=attn_metadata.decode.block_table,
+                cache_seqlens=attn_metadata.decode.seq_lens,
+                head_dim_v=self.kv_lora_rank,
+                tile_scheduler_metadata=tile_scheduler_metadata,
+                num_splits=num_splits,
+                softmax_scale=self.scale,
+                causal=True,
+                descale_q=layer._q_scale.reshape(1),
+                descale_k=layer._k_scale.reshape(1),
+            )
        o = reshape_attn_output_for_spec_decode(o)