Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3da17c2c
Unverified
Commit
3da17c2c
authored
Sep 19, 2025
by
Lucas Kabela
Committed by
GitHub
Sep 19, 2025
Browse files
[Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090)
Signed-off-by:
Lucas Kabela
<
lucaskabela@meta.com
>
parent
14c14327
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
11 additions
and
36 deletions
+11
-36
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+2
-14
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+1
-3
vllm/compilation/wrapper.py
vllm/compilation/wrapper.py
+4
-6
vllm/envs.py
vllm/envs.py
+0
-5
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-3
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+3
-5
No files found.
tests/compile/test_basic_correctness.py
View file @
3da17c2c
...
@@ -20,7 +20,6 @@ class TestSetting:
...
@@ -20,7 +20,6 @@ class TestSetting:
tp_size
:
int
tp_size
:
int
attn_backend
:
str
attn_backend
:
str
method
:
str
method
:
str
fullgraph
:
bool
# we cannot afford testing the full Cartesian product
# we cannot afford testing the full Cartesian product
...
@@ -36,7 +35,6 @@ class TestSetting:
...
@@ -36,7 +35,6 @@ class TestSetting:
tp_size
=
2
,
tp_size
=
2
,
attn_backend
=
"FLASH_ATTN"
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"generate"
,
method
=
"generate"
,
fullgraph
=
True
,
),
),
# llama model with quantization
# llama model with quantization
TestSetting
(
TestSetting
(
...
@@ -46,7 +44,6 @@ class TestSetting:
...
@@ -46,7 +44,6 @@ class TestSetting:
tp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"FLASH_ATTN"
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"generate"
,
method
=
"generate"
,
fullgraph
=
True
,
),
),
# MoE model
# MoE model
TestSetting
(
TestSetting
(
...
@@ -56,7 +53,6 @@ class TestSetting:
...
@@ -56,7 +53,6 @@ class TestSetting:
tp_size
=
2
,
tp_size
=
2
,
attn_backend
=
"FLASH_ATTN"
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"generate"
,
method
=
"generate"
,
fullgraph
=
True
,
),
),
# embedding model
# embedding model
TestSetting
(
TestSetting
(
...
@@ -73,7 +69,6 @@ class TestSetting:
...
@@ -73,7 +69,6 @@ class TestSetting:
tp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"FLASH_ATTN"
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"encode"
,
method
=
"encode"
,
fullgraph
=
True
,
),
),
TestSetting
(
TestSetting
(
model
=
"BAAI/bge-base-en-v1.5"
,
model
=
"BAAI/bge-base-en-v1.5"
,
...
@@ -82,7 +77,6 @@ class TestSetting:
...
@@ -82,7 +77,6 @@ class TestSetting:
tp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"FLASH_ATTN"
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"encode"
,
method
=
"encode"
,
fullgraph
=
True
,
),
),
# vision language model
# vision language model
TestSetting
(
TestSetting
(
...
@@ -92,7 +86,6 @@ class TestSetting:
...
@@ -92,7 +86,6 @@ class TestSetting:
tp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"FLASH_ATTN"
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"generate_with_image"
,
method
=
"generate_with_image"
,
fullgraph
=
False
,
),
),
],
],
)
)
...
@@ -109,9 +102,8 @@ def test_compile_correctness(
...
@@ -109,9 +102,8 @@ def test_compile_correctness(
tp_size
=
test_setting
.
tp_size
tp_size
=
test_setting
.
tp_size
attn_backend
=
test_setting
.
attn_backend
attn_backend
=
test_setting
.
attn_backend
method
=
test_setting
.
method
method
=
test_setting
.
method
fullgraph
=
test_setting
.
fullgraph
if
cuda_device_count_stateless
()
<
pp_size
*
tp_size
:
if
cuda_device_count_stateless
()
!=
pp_size
*
tp_size
:
pytest
.
skip
(
f
"Need at least
{
pp_size
}
*
{
tp_size
}
CUDA gpus but got "
pytest
.
skip
(
f
"Need exactly
{
pp_size
}
*
{
tp_size
}
CUDA gpus but got "
f
"
{
cuda_device_count_stateless
()
}
"
)
f
"
{
cuda_device_count_stateless
()
}
"
)
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
@@ -149,9 +141,5 @@ def test_compile_correctness(
...
@@ -149,9 +141,5 @@ def test_compile_correctness(
]:
]:
all_args
.
append
(
final_args
+
[
f
"-O
{
level
}
"
])
all_args
.
append
(
final_args
+
[
f
"-O
{
level
}
"
])
all_envs
.
append
({})
all_envs
.
append
({})
if
level
!=
CompilationLevel
.
DYNAMO_ONCE
and
not
fullgraph
:
# "DYNAMO_ONCE" will always use fullgraph
all_envs
[
-
1
][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
]
=
"0"
# type: ignore
compare_all_settings
(
model
,
all_args
*
3
,
all_envs
,
method
=
method
)
compare_all_settings
(
model
,
all_args
*
3
,
all_envs
,
method
=
method
)
tests/compile/test_full_graph.py
View file @
3da17c2c
...
@@ -79,9 +79,7 @@ def test_full_graph(
...
@@ -79,9 +79,7 @@ def test_full_graph(
):
):
model
,
model_kwargs
=
model_info
model
,
model_kwargs
=
model_info
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
():
# make sure these models can be captured in full graph mode
m
.
setenv
(
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
,
"1"
)
print
(
f
"MODEL=
{
model
}
"
)
print
(
f
"MODEL=
{
model
}
"
)
run_model
(
optimization_level
,
model
,
model_kwargs
)
run_model
(
optimization_level
,
model
,
model_kwargs
)
...
...
vllm/compilation/wrapper.py
View file @
3da17c2c
...
@@ -10,7 +10,6 @@ from typing import Callable, Optional
...
@@ -10,7 +10,6 @@ from typing import Callable, Optional
import
torch
import
torch
import
vllm.envs
as
envs
from
vllm.config
import
(
CompilationLevel
,
CUDAGraphMode
,
from
vllm.config
import
(
CompilationLevel
,
CUDAGraphMode
,
get_current_vllm_config
)
get_current_vllm_config
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -47,9 +46,8 @@ class TorchCompileWrapperWithCustomDispatcher:
...
@@ -47,9 +46,8 @@ class TorchCompileWrapperWithCustomDispatcher:
options
=
get_current_vllm_config
(
options
=
get_current_vllm_config
(
).
compilation_config
.
inductor_compile_config
).
compilation_config
.
inductor_compile_config
compiled_callable
=
torch
.
compile
(
compiled_callable
=
torch
.
compile
(
self
.
forward
,
self
.
forward
,
fullgraph
=
True
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
backend
=
backend
,
backend
=
backend
,
options
=
options
)
options
=
options
)
...
...
vllm/envs.py
View file @
3da17c2c
...
@@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_FLASH_ATTN_VERSION"
:
"VLLM_FLASH_ATTN_VERSION"
:
lambda
:
maybe_convert_int
(
os
.
environ
.
get
(
"VLLM_FLASH_ATTN_VERSION"
,
None
)),
lambda
:
maybe_convert_int
(
os
.
environ
.
get
(
"VLLM_FLASH_ATTN_VERSION"
,
None
)),
# Internal flag to enable Dynamo fullgraph capture
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
:
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
,
"1"
)
!=
"0"
),
# Feature flag to enable/disable Inductor standalone compile.
# Feature flag to enable/disable Inductor standalone compile.
# In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
# In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
# enabled by default.
# enabled by default.
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
3da17c2c
...
@@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
backend
=
self
.
vllm_config
.
compilation_config
.
init_backend
(
backend
=
self
.
vllm_config
.
compilation_config
.
init_backend
(
self
.
vllm_config
)
self
.
vllm_config
)
compilation_counter
.
dynamo_as_is_count
+=
1
compilation_counter
.
dynamo_as_is_count
+=
1
self
.
model
.
compile
(
self
.
model
.
compile
(
fullgraph
=
True
,
backend
=
backend
)
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
backend
=
backend
)
return
return
# for other compilation levels, cudagraph behavior is controlled by
# for other compilation levels, cudagraph behavior is controlled by
# CudagraphWraper and CudagraphDispatcher of vllm.
# CudagraphWraper and CudagraphDispatcher of vllm.
...
...
vllm/worker/model_runner.py
View file @
3da17c2c
...
@@ -18,7 +18,6 @@ import torch.distributed
...
@@ -18,7 +18,6 @@ import torch.distributed
import
torch.nn
as
nn
import
torch.nn
as
nn
from
tqdm.auto
import
tqdm
from
tqdm.auto
import
tqdm
import
vllm.envs
as
envs
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.attention.backends.abstract
import
AttentionState
from
vllm.attention.backends.abstract
import
AttentionState
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.attention.backends.utils
import
CommonAttentionState
...
@@ -1099,9 +1098,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1099,9 +1098,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
backend
=
self
.
vllm_config
.
compilation_config
.
init_backend
(
backend
=
self
.
vllm_config
.
compilation_config
.
init_backend
(
self
.
vllm_config
)
self
.
vllm_config
)
compilation_counter
.
dynamo_as_is_count
+=
1
compilation_counter
.
dynamo_as_is_count
+=
1
self
.
model
=
torch
.
compile
(
self
.
model
=
torch
.
compile
(
self
.
model
,
self
.
model
,
fullgraph
=
True
,
fullgraph
=
envs
.
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE
,
backend
=
backend
)
backend
=
backend
)
def
get_model
(
self
)
->
nn
.
Module
:
def
get_model
(
self
)
->
nn
.
Module
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment