Merge remote-tracking branch 'origin/v0.8.5-zero_overhead' into v0.8.5.post1-dev

0e8619b8 · zhuwenwen · 90f05cd6 · 0c5b1695 · 0e8619b8 · 0e8619b8
Commit 0e8619b8 authored May 29, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 19 additions and 0 deletions

vllm/engine/multiprocessing/engine.py vllm/engine/multiprocessing/engine.py +13 -0

vllm/envs.py vllm/envs.py +6 -0

No files found.
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -35,6 +35,7 @@ from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
 from vllm.usage.usage_lib import UsageContext
 from vllm.worker.model_runner_base import InputProcessingError
+import time

 logger = init_logger(__name__)

@@ -209,6 +210,8 @@ class MQLLMEngine:
    def run_engine_loop(self):
        """Core busy loop of the LLMEngine."""

+        last_no_req_time_refreshed = True
+        last_no_req_time = time.perf_counter()
        while True:
            if not self.engine.has_unfinished_requests():
                # Poll until there is work to do.
@@ -218,10 +221,20 @@ class MQLLMEngine:
                    self._health_check()
                    self.engine.do_log_stats()
                    logger.debug("Waiting for new requests in engine loop.")
+                last_no_req_time = time.perf_counter()
+                last_no_req_time_refreshed = True

            # Handle any input from the client.
            self.handle_new_input()

+            if envs.VLLM_TBO_REQ_DELAY_MS > 0 and last_no_req_time_refreshed and envs.VLLM_ENABLE_TBO:
+                if self.engine.get_num_unfinished_requests() < 2:
+                    time_diff_ms = int((time.perf_counter() - last_no_req_time) * 1000)
+                    if time_diff_ms < envs.VLLM_TBO_REQ_DELAY_MS:
+                        time.sleep(0.01) # sleep briefly while waiting for more requests to merge into one batch
+                        continue
+
+            last_no_req_time_refreshed = False
            # Engine step.
            request_outputs = self.engine_step()


--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -126,6 +126,8 @@ if TYPE_CHECKING:
    VLLM_HAS_CONTEXT_DEFAULT: bool = False
    VLLM_FLASH_ATTN_BACKEND: bool = False
    VLLM_ENABLE_TBO: bool = False
+    VLLM_TBO_REQ_DELAY_MS:int = 0
+
    VLLM_ZERO_OVERHEAD: bool = False
    VLLM_ENABLE_MOE_FUSED_GATE: bool = False

@@ -817,6 +819,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ENABLE_TBO":
    lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))),
    
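+    # Delay in ms to hold the engine step while fewer than two requests are pending, so more requests can be merged into a larger batch (0 disables; only used when VLLM_ENABLE_TBO is set).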
+    "VLLM_TBO_REQ_DELAY_MS":
+    lambda: int(os.getenv("VLLM_TBO_REQ_DELAY_MS", "0")),
+
    # Enable zero overhead scheduler.
    "VLLM_ZERO_OVERHEAD":
    lambda: bool(int(os.getenv("VLLM_ZERO_OVERHEAD", "0"))),