Commit 0c5b1695 authored by lizhigong's avatar lizhigong
Browse files

Add a delay setting in the server: sleep and wait for more requests so they can be merged into one batch

parent bf790acd
...@@ -35,6 +35,7 @@ from vllm.transformers_utils.config import ( ...@@ -35,6 +35,7 @@ from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value) maybe_register_config_serialize_by_value)
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.worker.model_runner_base import InputProcessingError from vllm.worker.model_runner_base import InputProcessingError
import time
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -209,6 +210,8 @@ class MQLLMEngine: ...@@ -209,6 +210,8 @@ class MQLLMEngine:
def run_engine_loop(self): def run_engine_loop(self):
"""Core busy loop of the LLMEngine.""" """Core busy loop of the LLMEngine."""
last_no_req_time_refreshed = True
last_no_req_time = time.perf_counter()
while True: while True:
if not self.engine.has_unfinished_requests(): if not self.engine.has_unfinished_requests():
# Poll until there is work to do. # Poll until there is work to do.
...@@ -218,10 +221,20 @@ class MQLLMEngine: ...@@ -218,10 +221,20 @@ class MQLLMEngine:
self._health_check() self._health_check()
self.engine.do_log_stats() self.engine.do_log_stats()
logger.debug("Waiting for new requests in engine loop.") logger.debug("Waiting for new requests in engine loop.")
last_no_req_time = time.perf_counter()
last_no_req_time_refreshed = True
# Handle any input from the client. # Handle any input from the client.
self.handle_new_input() self.handle_new_input()
if envs.VLLM_TBO_REQ_DELAY_MS > 0 and last_no_req_time_refreshed and envs.VLLM_ENABLE_TBO:
if self.engine.get_num_unfinished_requests() < 2:
time_diff_ms = int((time.perf_counter() - last_no_req_time) * 1000)
if time_diff_ms < envs.VLLM_TBO_REQ_DELAY_MS:
time.sleep(0.01) # sleep and waiting more request to merge in one batch
continue
last_no_req_time_refreshed = False
# Engine step. # Engine step.
request_outputs = self.engine_step() request_outputs = self.engine_step()
......
...@@ -126,6 +126,8 @@ if TYPE_CHECKING: ...@@ -126,6 +126,8 @@ if TYPE_CHECKING:
VLLM_HAS_CONTEXT_DEFAULT: bool = False VLLM_HAS_CONTEXT_DEFAULT: bool = False
VLLM_ENABLE_TBO: bool = False VLLM_ENABLE_TBO: bool = False
VLLM_TBO_REQ_DELAY_MS:int = 0
VLLM_ZERO_OVERHEAD: bool = False VLLM_ZERO_OVERHEAD: bool = False
def get_default_cache_root(): def get_default_cache_root():
...@@ -803,6 +805,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -803,6 +805,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENABLE_TBO": "VLLM_ENABLE_TBO":
lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))), lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))),
# Delay (in ms) applied on the server when there is only one request; the purpose is to wait for more requests and merge them into a larger batch.
"VLLM_TBO_REQ_DELAY_MS":
lambda: int(os.getenv("VLLM_TBO_REQ_DELAY_MS", "0")),
# Enable zero overhead scheduler. # Enable zero overhead scheduler.
"VLLM_ZERO_OVERHEAD": "VLLM_ZERO_OVERHEAD":
lambda: bool(int(os.getenv("VLLM_ZERO_OVERHEAD", "0"))), lambda: bool(int(os.getenv("VLLM_ZERO_OVERHEAD", "0"))),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment