[Attention] FlashAttn MLA (#14258)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>

[Attention] FlashAttn MLA (#14258)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
402759d4 · Lucas Wilkinson · GitHub · 2c301ee2 · 402759d4 · 402759d4
Unverified commit 402759d4, authored Sep 04, 2025 by Lucas Wilkinson; committed by GitHub on Sep 04, 2025
Showing with 14 additions and 9 deletions

vllm/v1/attention/backends/short_conv_attn.py vllm/v1/attention/backends/short_conv_attn.py +4 -3

vllm/v1/attention/backends/xformers.py vllm/v1/attention/backends/xformers.py +10 -6

No files found.
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -58,8 +58,9 @@ class ShortConvAttentionMetadataBuilder(
        state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
+            split_decodes_and_prefills(
-                                       decode_threshold=1))
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
        has_initial_states = None
        if num_prefills > 0:
            #[batch,]
@@ -78,4 +79,4 @@ class ShortConvAttentionMetadataBuilder(
            has_initial_states=has_initial_states,
            state_indices_tensor=state_indices_tensor,
        )
        return attn_metadata
\ No newline at end of file
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -3,7 +3,7 @@
 """Attention layer with XFormersAttention."""
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, ClassVar, Optional
 import torch
@@ -197,6 +197,8 @@ class XFormersAttentionMetadata:
 class XFormersAttentionMetadataBuilder(
        AttentionMetadataBuilder[XFormersAttentionMetadata]):
+    reorder_batch_threshold: ClassVar[int] = 1
    def __init__(
        self,
        kv_cache_spec: AttentionSpec,
@@ -212,9 +214,10 @@ class XFormersAttentionMetadataBuilder(
    def reorder_batch(self, input_batch: "InputBatch",
                      scheduler_output: "SchedulerOutput") -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(input_batch,
+        return reorder_batch_to_split_decodes_and_prefills(
-                                                           scheduler_output,
+            input_batch,
-                                                           decode_threshold=1)
+            scheduler_output,
+            decode_threshold=self.reorder_batch_threshold)
    def build(
        self,
@@ -223,8 +226,9 @@ class XFormersAttentionMetadataBuilder(
        fast_build: bool = False,
    ) -> XFormersAttentionMetadata:
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
+            split_decodes_and_prefills(
-                                       decode_threshold=1))
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        q_start_loc = common_attn_metadata.query_start_loc