Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d3fa2342
Commit
d3fa2342
authored
Jan 06, 2026
by
zhuwenwen
Browse files
[Perf] Change default CUDAGraphMode from FULL_AND_PIECEWISE to PIECEWISE
parent
55989b60
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
58 additions
and
50 deletions
+58
-50
vllm/config/compilation.py
vllm/config/compilation.py
+1
-1
vllm/config/vllm.py
vllm/config/vllm.py
+24
-21
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+33
-28
No files found.
vllm/config/compilation.py
View file @
d3fa2342
...
@@ -78,7 +78,7 @@ class CUDAGraphMode(enum.Enum):
...
@@ -78,7 +78,7 @@ class CUDAGraphMode(enum.Enum):
return
self
.
has_mode
(
CUDAGraphMode
.
PIECEWISE
)
return
self
.
has_mode
(
CUDAGraphMode
.
PIECEWISE
)
def
max_cudagraph_mode
(
self
)
->
"CUDAGraphMode"
:
def
max_cudagraph_mode
(
self
)
->
"CUDAGraphMode"
:
return
CUDAGraphMode
(
max
(
self
.
value
)
if
not
envs
.
VLLM_USE_PIECEWISE
else
min
(
self
.
value
)
)
if
self
.
separate_routine
()
else
self
return
CUDAGraphMode
(
max
(
self
.
value
))
if
self
.
separate_routine
()
else
self
def
has_full_cudagraphs
(
self
)
->
bool
:
def
has_full_cudagraphs
(
self
)
->
bool
:
return
self
.
max_cudagraph_mode
()
==
CUDAGraphMode
.
FULL
return
self
.
max_cudagraph_mode
()
==
CUDAGraphMode
.
FULL
...
...
vllm/config/vllm.py
View file @
d3fa2342
...
@@ -694,28 +694,31 @@ class VllmConfig:
...
@@ -694,28 +694,31 @@ class VllmConfig:
if
current_platform
.
support_static_graph_mode
():
if
current_platform
.
support_static_graph_mode
():
# if cudagraph_mode has full cudagraphs, we need to check support
# if cudagraph_mode has full cudagraphs, we need to check support
if
model_config
:
=
self
.
model_config
:
if
model_config
:
=
self
.
model_config
:
if
(
if
not
envs
.
VLLM_USE_PIECEWISE
:
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
if
(
and
model_config
.
pooler_config
is
not
None
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
):
and
model_config
.
pooler_config
is
not
None
logger
.
warning_once
(
):
"Pooling models do not support full cudagraphs. "
logger
.
warning_once
(
"Overriding cudagraph_mode to PIECEWISE."
"Pooling models do not support full cudagraphs. "
)
"Overriding cudagraph_mode to PIECEWISE."
)
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
elif
(
model_config
.
is_encoder_decoder
and
self
.
compilation_config
.
cudagraph_mode
not
in
(
CUDAGraphMode
.
NONE
,
CUDAGraphMode
.
FULL_DECODE_ONLY
)
):
logger
.
info_once
(
"Encoder-decoder models do not support %s. "
"Overriding cudagraph_mode to FULL_DECODE_ONLY."
,
self
.
compilation_config
.
cudagraph_mode
.
name
,
)
self
.
compilation_config
.
cudagraph_mode
=
(
CUDAGraphMode
.
FULL_DECODE_ONLY
)
else
:
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
elif
(
model_config
.
is_encoder_decoder
and
self
.
compilation_config
.
cudagraph_mode
not
in
(
CUDAGraphMode
.
NONE
,
CUDAGraphMode
.
FULL_DECODE_ONLY
)
):
logger
.
info_once
(
"Encoder-decoder models do not support %s. "
"Overriding cudagraph_mode to FULL_DECODE_ONLY."
,
self
.
compilation_config
.
cudagraph_mode
.
name
,
)
self
.
compilation_config
.
cudagraph_mode
=
(
CUDAGraphMode
.
FULL_DECODE_ONLY
)
# disable cudagraph when enforce eager execution
# disable cudagraph when enforce eager execution
if
self
.
model_config
is
not
None
and
self
.
model_config
.
enforce_eager
:
if
self
.
model_config
is
not
None
and
self
.
model_config
.
enforce_eager
:
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
d3fa2342
...
@@ -4109,7 +4109,11 @@ class GPUModelRunner(
...
@@ -4109,7 +4109,11 @@ class GPUModelRunner(
# TODO(luka) better system for describing dummy batches
# TODO(luka) better system for describing dummy batches
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
else
:
else
:
seq_lens
=
max_query_len
# type: ignore[assignment]
if
not
envs
.
VLLM_USE_PIECEWISE
:
seq_lens
=
max_query_len
else
:
# Make sure max_model_len is used at the graph capture time.
seq_lens
=
self
.
max_model_len
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
copy_to_gpu
()
self
.
seq_lens
.
copy_to_gpu
()
...
@@ -4825,35 +4829,36 @@ class GPUModelRunner(
...
@@ -4825,35 +4829,36 @@ class GPUModelRunner(
logger
.
warning
(
msg
)
logger
.
warning
(
msg
)
# check that if we are doing decode full-cudagraphs it is supported
# check that if we are doing decode full-cudagraphs it is supported
if
(
if
not
envs
.
VLLM_USE_PIECEWISE
:
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
if
(
and
min_cg_support
==
AttentionCGSupport
.
NEVER
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
):
and
min_cg_support
==
AttentionCGSupport
.
NEVER
msg
=
(
f
"CUDAGraphMode.
{
cudagraph_mode
.
name
}
is not supported "
f
"with
{
min_cg_backend_name
}
backend (support: "
f
"
{
min_cg_support
}
)"
)
if
self
.
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
and
(
self
.
compilation_config
.
splitting_ops_contain_attention
()
or
self
.
compilation_config
.
use_inductor_graph_partition
):
):
msg
+=
(
msg
=
(
"; setting cudagraph_mode=PIECEWISE because "
f
"CUDAGraphMode.
{
cudagraph_mode
.
name
}
is not supported "
"attention is compiled piecewise"
f
"with
{
min_cg_backend_name
}
backend (support: "
f
"
{
min_cg_support
}
)"
)
)
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
(
if
self
.
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
and
(
CUDAGraphMode
.
PIECEWISE
self
.
compilation_config
.
splitting_ops_contain_attention
()
)
or
self
.
compilation_config
.
use_inductor_graph_partition
else
:
):
msg
+=
(
msg
+=
(
"; setting cudagraph_mode=NONE because "
"; setting cudagraph_mode=PIECEWISE because "
"attention is not compiled piecewise"
"attention is compiled piecewise"
)
)
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
(
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
(
CUDAGraphMode
.
NONE
CUDAGraphMode
.
PIECEWISE
)
)
logger
.
warning
(
msg
)
else
:
msg
+=
(
"; setting cudagraph_mode=NONE because "
"attention is not compiled piecewise"
)
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
(
CUDAGraphMode
.
NONE
)
logger
.
warning
(
msg
)
# check that if we are doing spec-decode + decode full-cudagraphs it is
# check that if we are doing spec-decode + decode full-cudagraphs it is
# supported
# supported
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment