Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
04e38500
Unverified
Commit
04e38500
authored
Jul 29, 2025
by
Richard Zou
Committed by
GitHub
Jul 29, 2025
Browse files
[Bugfix] VLLM_V1 supports passing other compilation levels (#19340)
Signed-off-by: Richard Zou <zou3519@gmail.com>
parent
ab714131
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing 5 changed files with 88 additions and 5 deletions
+88
-5
tests/compile/test_config.py
tests/compile/test_config.py
+53
-2
vllm/compilation/counter.py
vllm/compilation/counter.py
+2
-0
vllm/config.py
vllm/config.py
+19
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+12
-1
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+2
-0
No files found.
tests/compile/test_config.py
View file @
04e38500
...
...
@@ -26,6 +26,8 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@
pytest
.
mark
.
forked
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
...
...
@@ -33,8 +35,8 @@ def test_use_cudagraphs_dynamic(monkeypatch):
def
test_VLLM_DISABLE_COMPILE_CACHE
(
vllm_runner
,
monkeypatch
,
val
):
assert
vllm
.
envs
.
VLLM_USE_V1
# spawn means that the counters are in the same process.
monkeypatch
.
setenv
(
'VLLM_
WORKER
_MULTIPROC
_METHOD'
,
"spawn"
)
# Disable multiprocessing so that the counter is in the same process
monkeypatch
.
setenv
(
'VLLM_
ENABLE_V1
_MULTIPROC
ESSING'
,
'0'
)
monkeypatch
.
setenv
(
'VLLM_DISABLE_COMPILE_CACHE'
,
val
)
compilation_config
=
{
...
...
@@ -50,6 +52,8 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
pass
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@
pytest
.
mark
.
forked
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
assert
vllm
.
envs
.
VLLM_USE_V1
...
...
@@ -72,3 +76,50 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
compilation_config
=
compilation_config
,
gpu_memory_utilization
=
0.4
)
as
_
):
pass
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_dynamo_as_is(vllm_runner, monkeypatch):
    """Load a model at compilation level 1 ("dynamo as is").

    The load runs inside ``compilation_counter.expect(dynamo_as_is_count=1)``.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    runner_kwargs = {
        "compilation_config": {"level": 1},
        "gpu_memory_utilization": 0.4,
    }
    with compilation_counter.expect(dynamo_as_is_count=1):
        # loading the model causes compilation (if enabled) to happen
        with vllm_runner('facebook/opt-125m', **runner_kwargs) as _:
            pass
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_no_compilation(vllm_runner, monkeypatch):
    """Load a model at compilation level 0 ("no compilation").

    The load runs inside a ``compilation_counter.expect`` scope with both
    ``num_graphs_seen`` and ``dynamo_as_is_count`` set to 0.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    runner_kwargs = {
        "compilation_config": {"level": 0},
        "gpu_memory_utilization": 0.4,
    }
    with compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0):
        # loading the model causes compilation (if enabled) to happen
        with vllm_runner('facebook/opt-125m', **runner_kwargs) as _:
            pass
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_enforce_eager(vllm_runner, monkeypatch):
    """Load a model with ``enforce_eager=True`` and no explicit level.

    The load runs inside a ``compilation_counter.expect`` scope with both
    ``num_graphs_seen`` and ``dynamo_as_is_count`` set to 0.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    runner_kwargs = {
        "enforce_eager": True,
        "gpu_memory_utilization": 0.4,
    }
    with compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0):
        # loading the model causes compilation (if enabled) to happen
        with vllm_runner('facebook/opt-125m', **runner_kwargs) as _:
            pass
vllm/compilation/counter.py
View file @
04e38500
...
...
@@ -27,6 +27,8 @@ class CompilationCounter:
num_cache_entries_updated
:
int
=
0
# The number of standalone_compile compiled artifacts saved
num_compiled_artifacts_saved
:
int
=
0
# Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS
dynamo_as_is_count
:
int
=
0
def
clone
(
self
)
->
"CompilationCounter"
:
return
copy
.
deepcopy
(
self
)
...
...
vllm/config.py
View file @
04e38500
...
...
@@ -4106,9 +4106,11 @@ class CompilationConfig:
certain small batchsizes, where inductor is good at optimizing.
"""
# Top-level Compilation control
level
:
int
=
0
level
:
Optional
[
int
]
=
None
"""The level of compilation:
- None: If None, we will select the default compilation level.
For V1 engine this is 3, for V0 engine this is 0.
- 0: no compilation.
- 1: dynamo as is.
- 2: dynamo once.
...
...
@@ -4664,6 +4666,22 @@ class VllmConfig:
"To workaround this limitation, vLLM will set 'ieee' input "
"precision for chunked prefill triton kernels."
)
# If the user does not explicitly set a compilation level, then
# we use the default level. The default level depends on other
# settings (see the below code).
if
self
.
compilation_config
.
level
is
None
:
if
envs
.
VLLM_USE_V1
:
if
(
self
.
model_config
is
not
None
and
not
self
.
model_config
.
enforce_eager
):
self
.
compilation_config
.
level
=
CompilationLevel
.
PIECEWISE
else
:
self
.
compilation_config
.
level
=
\
CompilationLevel
.
NO_COMPILATION
else
:
# NB: Passing both --enforce-eager and a compilation level
# in V0 means the compilation level wins out.
self
.
compilation_config
.
level
=
CompilationLevel
.
NO_COMPILATION
# async tp is built on top of sequence parallelism
# and requires it to be enabled.
if
self
.
compilation_config
.
pass_config
.
enable_async_tp
:
...
...
@@ -4676,7 +4694,6 @@ class VllmConfig:
# By default, V1 uses piecewise CUDA graphs. If full_cuda_graph
# is set to True, full CUDA graphs will be used.
self
.
compilation_config
.
cudagraph_num_of_warmups
=
1
self
.
compilation_config
.
level
=
CompilationLevel
.
PIECEWISE
self
.
compilation_config
.
set_splitting_ops_for_v1
()
self
.
_set_cudagraph_sizes
()
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
04e38500
...
...
@@ -43,7 +43,7 @@ from vllm.sequence import IntermediateTensors, PoolerOutput
from
vllm.tasks
import
GenerationTask
,
PoolingTask
,
SupportedTask
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
DeviceMemoryProfiler
,
GiB_bytes
,
LazyLoader
,
check_use_alibi
,
get_dtype_size
,
is_pin_memory_available
,
round_up
)
is_pin_memory_available
,
round_up
,
supports_dynamo
)
from
vllm.v1.attention.backends.mamba_selectors
import
get_mamba_attn_backend
from
vllm.v1.attention.backends.utils
import
(
AttentionMetadataBuilder
,
CommonAttentionMetadata
,
...
...
@@ -1930,6 +1930,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
rank_mapping
,
)
if
(
self
.
vllm_config
.
compilation_config
.
level
==
\
CompilationLevel
.
DYNAMO_AS_IS
and
supports_dynamo
()
):
backend
=
self
.
vllm_config
.
compilation_config
.
init_backend
(
self
.
vllm_config
)
compilation_counter
.
dynamo_as_is_count
+=
1
self
.
model
.
compile
(
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
backend
=
backend
)
def
reload_weights
(
self
)
->
None
:
assert
getattr
(
self
,
"model"
,
None
)
is
not
None
,
\
"Cannot reload weights before model is loaded."
...
...
vllm/worker/model_runner.py
View file @
04e38500
...
...
@@ -22,6 +22,7 @@ import vllm.envs as envs
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.attention.backends.abstract
import
AttentionState
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
CompilationLevel
,
VllmConfig
from
vllm.core.scheduler
import
SchedulerOutputs
from
vllm.distributed
import
broadcast_tensor_dict
,
get_pp_group
...
...
@@ -1121,6 +1122,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
CompilationLevel
.
DYNAMO_AS_IS
and
supports_dynamo
():
backend
=
self
.
vllm_config
.
compilation_config
.
init_backend
(
self
.
vllm_config
)
compilation_counter
.
dynamo_as_is_count
+=
1
self
.
model
=
torch
.
compile
(
self
.
model
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment