Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
24fab45d
Unverified
Commit
24fab45d
authored
Sep 23, 2025
by
Michael Goin
Committed by
GitHub
Sep 23, 2025
Browse files
[Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
63400259
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
32 additions
and
7 deletions
+32
-7
vllm/config/__init__.py
vllm/config/__init__.py
+8
-1
vllm/config/compilation.py
vllm/config/compilation.py
+3
-4
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+21
-2
No files found.
vllm/config/__init__.py
View file @
24fab45d
...
...
@@ -509,8 +509,15 @@ class VllmConfig:
if
self
.
compilation_config
.
cudagraph_mode
is
None
:
if
envs
.
VLLM_USE_V1
and
self
.
compilation_config
.
level
\
==
CompilationLevel
.
PIECEWISE
:
# default to full and piecewise for most models
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
PIECEWISE
CUDAGraphMode
.
FULL_AND_PIECEWISE
# pooling models do not support full cudagraphs
if
self
.
model_config
is
not
None
and
\
self
.
model_config
.
pooler_config
is
not
None
:
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
PIECEWISE
else
:
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
NONE
...
...
vllm/config/compilation.py
View file @
24fab45d
...
...
@@ -228,15 +228,14 @@ class CompilationConfig:
The mode of the cudagraph:
- NONE, no cudagraph capture.
- PIECEWISE.
(v1 default)
- PIECEWISE.
- FULL.
- FULL_DECODE_ONLY.
- FULL_AND_PIECEWISE.
- FULL_AND_PIECEWISE.
(v1 default)
PIECEWISE mode: builds piecewise cudagraphs only, keeping the
cudagraph-incompatible ops (e.g. some attention ops) outside the cudagraph
for general flexibility.
This is the default mode.
FULL mode: Capture full cudagraph for all batches. Can be good for small
models or workloads with small prompts; not supported by many backends.
...
...
@@ -249,7 +248,7 @@ class CompilationConfig:
FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
piecewise cudagraph for prefill and mixed prefill-decode batches.
This is
like
the most performant mode for most models.
This is the most performant mode for most models
and is the default
.
Currently, the cudagraph mode is only used for the v1 engine.
Note that the cudagraph logic is generally orthogonal to the
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
24fab45d
...
...
@@ -2947,8 +2947,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# TODO(luka) better system for describing dummy batches
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
else
:
# Make sure max_model_len is used at the graph capture time.
seq_lens
=
self
.
max_model_len
seq_lens
=
max_query_len
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
copy_to_gpu
()
...
...
@@ -3541,6 +3540,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
CUDAGraphMode
.
FULL_DECODE_ONLY
logger
.
warning
(
msg
)
# check that if we are doing decode full-cudagraphs it is supported
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
and
min_cg_support
==
AttentionCGSupport
.
NEVER
):
msg
=
(
f
"CUDAGraphMode.
{
cudagraph_mode
.
name
}
is not supported "
f
"with
{
min_cg_builder_name
}
backend (support: "
f
"
{
min_cg_support
}
)"
)
if
(
self
.
compilation_config
.
level
==
CompilationLevel
.
PIECEWISE
and
(
self
.
compilation_config
.
splitting_ops_contain_attention
()
or
self
.
compilation_config
.
use_inductor_graph_partition
)):
msg
+=
"; setting cudagraph_mode=PIECEWISE because "
\
"attention is compiled piecewise"
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
PIECEWISE
else
:
msg
+=
"; setting cudagraph_mode=NONE because "
\
"attention is not compiled piecewise"
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
NONE
logger
.
warning
(
msg
)
# check that if we are doing spec-decode + decode full-cudagraphs it is
# supported
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment