Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0e8619b8
Commit
0e8619b8
authored
May 29, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'origin/v0.8.5-zero_overhead' into v0.8.5.post1-dev
parents
90f05cd6
0c5b1695
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
0 deletions
+19
-0
vllm/engine/multiprocessing/engine.py
vllm/engine/multiprocessing/engine.py
+13
-0
vllm/envs.py
vllm/envs.py
+6
-0
No files found.
vllm/engine/multiprocessing/engine.py
View file @
0e8619b8
...
...
@@ -35,6 +35,7 @@ from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value
)
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.worker.model_runner_base
import
InputProcessingError
import
time
logger
=
init_logger
(
__name__
)
...
...
@@ -209,6 +210,8 @@ class MQLLMEngine:
def
run_engine_loop
(
self
):
"""Core busy loop of the LLMEngine."""
last_no_req_time_refreshed
=
True
last_no_req_time
=
time
.
perf_counter
()
while
True
:
if
not
self
.
engine
.
has_unfinished_requests
():
# Poll until there is work to do.
...
...
@@ -218,10 +221,20 @@ class MQLLMEngine:
self
.
_health_check
()
self
.
engine
.
do_log_stats
()
logger
.
debug
(
"Waiting for new requests in engine loop."
)
last_no_req_time
=
time
.
perf_counter
()
last_no_req_time_refreshed
=
True
# Handle any input from the client.
self
.
handle_new_input
()
if
envs
.
VLLM_TBO_REQ_DELAY_MS
>
0
and
last_no_req_time_refreshed
and
envs
.
VLLM_ENABLE_TBO
:
if
self
.
engine
.
get_num_unfinished_requests
()
<
2
:
time_diff_ms
=
int
((
time
.
perf_counter
()
-
last_no_req_time
)
*
1000
)
if
time_diff_ms
<
envs
.
VLLM_TBO_REQ_DELAY_MS
:
time
.
sleep
(
0.01
)
# Sleep and wait for more requests so they can be merged into one batch.
continue
last_no_req_time_refreshed
=
False
# Engine step.
request_outputs
=
self
.
engine_step
()
...
...
vllm/envs.py
View file @
0e8619b8
...
...
@@ -126,6 +126,8 @@ if TYPE_CHECKING:
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_FLASH_ATTN_BACKEND
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
VLLM_TBO_REQ_DELAY_MS
:
int
=
0
VLLM_ZERO_OVERHEAD
:
bool
=
False
VLLM_ENABLE_MOE_FUSED_GATE
:
bool
=
False
...
...
@@ -817,6 +819,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENABLE_TBO"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_TBO"
,
"0"
))),
# Delay (in ms) applied on the server when there is only one request; the purpose is to merge more requests into a larger batch.
"VLLM_TBO_REQ_DELAY_MS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TBO_REQ_DELAY_MS"
,
"0"
)),
# Enable zero overhead scheduler.
"VLLM_ZERO_OVERHEAD"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ZERO_OVERHEAD"
,
"0"
))),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment