Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0c5b1695
Commit
0c5b1695
authored
May 29, 2025
by
lizhigong
Browse files
Add a delay setting in the server: sleep and wait for more requests so they can be merged into one batch
parent
bf790acd
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
0 deletions
+19
-0
vllm/engine/multiprocessing/engine.py
vllm/engine/multiprocessing/engine.py
+13
-0
vllm/envs.py
vllm/envs.py
+6
-0
No files found.
vllm/engine/multiprocessing/engine.py
View file @
0c5b1695
...
...
@@ -35,6 +35,7 @@ from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value
)
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.worker.model_runner_base
import
InputProcessingError
import
time
logger
=
init_logger
(
__name__
)
...
...
@@ -209,6 +210,8 @@ class MQLLMEngine:
def
run_engine_loop
(
self
):
"""Core busy loop of the LLMEngine."""
last_no_req_time_refreshed
=
True
last_no_req_time
=
time
.
perf_counter
()
while
True
:
if
not
self
.
engine
.
has_unfinished_requests
():
# Poll until there is work to do.
...
...
@@ -218,10 +221,20 @@ class MQLLMEngine:
self
.
_health_check
()
self
.
engine
.
do_log_stats
()
logger
.
debug
(
"Waiting for new requests in engine loop."
)
last_no_req_time
=
time
.
perf_counter
()
last_no_req_time_refreshed
=
True
# Handle any input from the client.
self
.
handle_new_input
()
if
envs
.
VLLM_TBO_REQ_DELAY_MS
>
0
and
last_no_req_time_refreshed
and
envs
.
VLLM_ENABLE_TBO
:
if
self
.
engine
.
get_num_unfinished_requests
()
<
2
:
time_diff_ms
=
int
((
time
.
perf_counter
()
-
last_no_req_time
)
*
1000
)
if
time_diff_ms
<
envs
.
VLLM_TBO_REQ_DELAY_MS
:
time
.
sleep
(
0.01
)
# Sleep and wait for more requests so they can be merged into one batch.
continue
last_no_req_time_refreshed
=
False
# Engine step.
request_outputs
=
self
.
engine_step
()
...
...
vllm/envs.py
View file @
0c5b1695
...
...
@@ -126,6 +126,8 @@ if TYPE_CHECKING:
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
VLLM_TBO_REQ_DELAY_MS
:
int
=
0
VLLM_ZERO_OVERHEAD
:
bool
=
False
def
get_default_cache_root
():
...
...
@@ -803,6 +805,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENABLE_TBO"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_TBO"
,
"0"
))),
# Delay (in milliseconds) applied on the server when there is only one request; the purpose is to wait for more requests and merge them into a larger batch.
"VLLM_TBO_REQ_DELAY_MS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TBO_REQ_DELAY_MS"
,
"0"
)),
# Enable zero overhead scheduler.
"VLLM_ZERO_OVERHEAD"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ZERO_OVERHEAD"
,
"0"
))),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment