[BugFix] Fix async scheduling + chunked prefill + preemption (#28787)

Signed-off-by: Nick Hill <nhill@redhat.com>

[BugFix] Fix async scheduling + chunked prefill + preemption (#28787)
Signed-off-by: Nick Hill <nhill@redhat.com>
80b6080d · Nick Hill · GitHub · 03ee4811 · 80b6080d · 80b6080d
Unverified commit 80b6080d, authored Nov 16, 2025 by Nick Hill; committed by GitHub on Nov 17, 2025.
Showing with 8 additions and 9 deletions

tests/v1/e2e/test_async_scheduling.py +4 -6

vllm/v1/core/sched/scheduler.py +1 -3

vllm/v1/utils.py +3 -0

No files found.
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -65,9 +65,8 @@ def test_without_spec_decoding(
        (True, "mp", True, None, False),
        (True, "uni", True, None, False),
        (False, "mp", True, None, True),
-        # Async scheduling + preemption + chunked prefill needs to be fixed (WIP)
-        # (True, "mp", True, None, True),
-        # (True, "uni", True, None, True),
+        (True, "mp", True, None, True),
+        (True, "uni", True, None, True),
    ]

    run_tests(
@@ -103,9 +102,8 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
        (False, "mp", True, spec_config_short, True),
        (True, "uni", True, spec_config, False),
        (True, "uni", True, spec_config_short, False),
-        # Async scheduling + preemption + chunked prefill needs to be fixed (WIP)
-        #  (True, "mp", True, spec_config, True),
-        #  (True, "uni", True, spec_config_short, True),
+        (True, "mp", True, spec_config, True),
+        (True, "uni", True, spec_config_short, True),
    ]

    run_tests(

--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -778,9 +778,7 @@ class Scheduler(SchedulerInterface):
                assert not scheduled_in_prev_step
                resumed_req_ids.add(req_id)
            if not scheduled_in_prev_step:
-                all_token_ids[req_id] = req.all_token_ids[
-                    : req.num_computed_tokens + num_tokens
-                ]
+                all_token_ids[req_id] = req.all_token_ids.copy()
            new_block_ids.append(
                req_to_new_blocks[req_id].get_block_ids(allow_none=True)
            )

--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -97,6 +97,9 @@ class ConstantList(Generic[T], Sequence):
    def __repr__(self):
        return f"ConstantList({self._x})"

+    def copy(self) -> list[T]:
+        return self._x.copy()
+

 class CpuGpuBuffer:
    """Buffer to easily copy tensors between CPU and GPU."""