[Bugfix] Fix stale SSM state for new Mamba requests scheduled as decode (#32118)

Signed-off-by: Josephasafg <ajgard7@gmail.com>

[Bugfix] Fix stale SSM state for new Mamba requests scheduled as decode (#32118)
Signed-off-by: Josephasafg <ajgard7@gmail.com>
8fb2c135 · Asaf Joseph Gardin · GitHub · 8863c2b2 · 8fb2c135 · 8fb2c135
Unverified Commit 8fb2c135 authored Jan 12, 2026 by Asaf Joseph Gardin Committed by GitHub Jan 12, 2026
Show whitespace changes
Inline Side-by-side

Showing with 24 additions and 3 deletions

tests/v1/attention/test_batch_reordering.py tests/v1/attention/test_batch_reordering.py +21 -0

vllm/v1/attention/backends/utils.py vllm/v1/attention/backends/utils.py +3 -3

No files found.
--- a/tests/v1/attention/test_batch_reordering.py
+++ b/tests/v1/attention/test_batch_reordering.py
@@ -98,6 +98,27 @@ REORDER_TEST_CASES = {
        expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
        expected_modified=True,
    ),
+    "new_request_single_token_prefill": ReorderTestCase(
+        requests=[
+            (100, 0),
+            (1, 0),  # New request with only 1 token (STILL prefill)
+            (50, 100),
+            (1, 10),
+        ],
+        # Only index 3 is a true decode (1 scheduled token, num_computed_tokens > 0)
+        expected_order=[3, 2, 0, 1],
+        expected_modified=True,
+    ),
+    "multiple_new_requests_single_token_prefill": ReorderTestCase(
+        requests=[
+            (1, 0),  # New prefill (1 token, no computed)
+            (1, 0),  # New prefill (1 token, no computed)
+            (1, 50),
+            (200, 0),
+        ],
+        expected_order=[2, 1, 0, 3],
+        expected_modified=True,
+    ),
 }



--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -1040,9 +1040,9 @@ def reorder_batch_to_split_decodes_and_prefills(
    num_scheduled_tokens_np = np.array(num_scheduled_tokens)
    num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs]

-    is_decode = num_scheduled_tokens_np <= decode_threshold
-    is_extend = (~is_decode) & (num_computed_tokens_np > 0)
-    is_prefill = (~is_decode) & (num_computed_tokens_np == 0)
+    is_prefill = num_computed_tokens_np == 0
+    is_decode = (num_scheduled_tokens_np <= decode_threshold) & (~is_prefill)
+    is_extend = (num_scheduled_tokens_np > decode_threshold) & (~is_prefill)

    # Desired order: decode → extend → prefill
    req_regions = np.zeros(is_decode.shape, dtype=np.int32)  # 0 = decode by default