Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3ce3e3d9
Commit
3ce3e3d9
authored
Apr 15, 2025
by
王敏
Browse files
[feat]添加VLLM_ENFORCE_EAGER_BS_THRESHOLD环境变量,支持cudagraph模式下,当bs大于阈值时,强制切换为eager模式,对大bs有效果
parent
6f49c1ed
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
1 deletion
+12
-1
vllm/envs.py
vllm/envs.py
+5
-0
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+7
-1
No files found.
vllm/envs.py
View file @
3ce3e3d9
...
...
@@ -95,6 +95,7 @@ if TYPE_CHECKING:
VLLM_RAY_PER_WORKER_GPUS
:
float
=
1.0
VLLM_RAY_BUNDLE_INDICES
:
str
=
""
VLLM_SPEC_DECODE_EAGER
:
bool
=
False
VLLM_ENFORCE_EAGER_BS_THRESHOLD
:
Optional
[
int
]
=
None
def
get_default_cache_root
():
...
...
@@ -618,6 +619,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the draft model in cudagraph mode.
"VLLM_SPEC_DECODE_EAGER"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_SPEC_DECODE_EAGER"
,
"0"
))),
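# If set to a positive integer, vLLM skips the CUDA graph path and runs the
# model in eager mode whenever the batch size exceeds this threshold.
# Unset or <= 0 (default -1) disables the check, so CUDA graphs are always used.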
"VLLM_ENFORCE_EAGER_BS_THRESHOLD"
:
lambda
:
int
(
os
.
environ
.
get
(
"VLLM_ENFORCE_EAGER_BS_THRESHOLD"
,
"-1"
)),
}
# end-env-vars-definition
...
...
vllm/worker/model_runner.py
View file @
3ce3e3d9
# SPDX-License-Identifier: Apache-2.0
import
sys
import
dataclasses
import
gc
import
inspect
...
...
@@ -1109,6 +1110,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# multi-step model runner does not have `_builder_cls`
self
.
builder
=
self
.
_builder_cls
(
weakref
.
proxy
(
self
))
self
.
enforce_eager_bs_threshould
=
sys
.
maxsize
if
envs
.
VLLM_ENFORCE_EAGER_BS_THRESHOLD
is
not
None
and
envs
.
VLLM_ENFORCE_EAGER_BS_THRESHOLD
>
0
:
self
.
enforce_eager_bs_threshould
=
envs
.
VLLM_ENFORCE_EAGER_BS_THRESHOLD
def
load_model
(
self
)
->
None
:
logger
.
info
(
"Starting to load model %s..."
,
self
.
model_config
.
model
)
with
DeviceMemoryProfiler
()
as
m
:
...
...
@@ -1680,7 +1685,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
# TODO(andoorve): We can remove this once all
# virtual engines share the same kv cache.
virtual_engine
=
model_input
.
virtual_engine
if
prefill_meta
is
None
and
decode_meta
.
use_cuda_graph
:
if
prefill_meta
is
None
and
decode_meta
.
use_cuda_graph
and
\
model_input
.
input_tokens
.
shape
[
0
]
<=
self
.
enforce_eager_bs_threshould
:
assert
model_input
.
input_tokens
is
not
None
graph_batch_size
=
model_input
.
input_tokens
.
shape
[
0
]
model_executable
=
self
.
graph_runners
[
virtual_engine
][
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment