[V1][PP] Do not block engine core when no requests to schedule (#14585)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>

[V1][PP] Do not block engine core when no requests to schedule (#14585)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
4290b704 · Cody Yu · GitHub · c91b64f7 · 4290b704
Unverified commit 4290b704, authored Mar 10, 2025 by Cody Yu; committed by GitHub on Mar 10, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 16 deletions

vllm/v1/engine/core.py vllm/v1/engine/core.py +11 -16

No files found.
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -205,23 +205,18 @@ class EngineCore:
                self.batch_queue.put_nowait(
                    (future, scheduler_output))  # type: ignore
-        # If all requests are scheduled or the job queue is full,
+        scheduled_batch = (scheduler_output is not None
+                           and scheduler_output.total_num_scheduled_tokens > 0)
+        # If no more requests can be scheduled and the job queue is not empty,
        # block until the first batch in the job queue is finished.
-        if (scheduler_output is None
-                or scheduler_output.total_num_scheduled_tokens == 0):
-            try:
-                future, scheduler_output = self.batch_queue.get(
-                    timeout=POLLING_TIMEOUT_S)
-                # Blocking until the first result is available.
-                model_output = future.result()
-                self.batch_queue.task_done()
-                engine_core_outputs = self.scheduler.update_from_output(
-                    scheduler_output, model_output)
-            except queue.Empty:
-                # If the queue is empty (timeout at .get), return
-                # an empty EngineCoreOutputs for logging.
-                engine_core_outputs = EngineCoreOutputs(
-                    outputs=[], scheduler_stats=self.scheduler.make_stats())
+        if not scheduled_batch and not self.batch_queue.empty():
+            future, scheduler_output = self.batch_queue.get_nowait()
+            # Blocking until the first result is available.
+            model_output = future.result()
+            self.batch_queue.task_done()
+            engine_core_outputs = self.scheduler.update_from_output(
+                scheduler_output, model_output)
        return engine_core_outputs