Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
24fab45d
Unverified
Commit
24fab45d
authored Sep 23, 2025 by Michael Goin
Committed by GitHub Sep 23, 2025
Browse files
[Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent
63400259
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 32 additions and 7 deletions
+32
-7
vllm/config/__init__.py
vllm/config/__init__.py
+8
-1
vllm/config/compilation.py
vllm/config/compilation.py
+3
-4
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+21
-2
No files found.
vllm/config/__init__.py
View file @
24fab45d
...
@@ -509,6 +509,13 @@ class VllmConfig:
...
@@ -509,6 +509,13 @@ class VllmConfig:
if
self
.
compilation_config
.
cudagraph_mode
is
None
:
if
self
.
compilation_config
.
cudagraph_mode
is
None
:
if
envs
.
VLLM_USE_V1
and
self
.
compilation_config
.
level
\
if
envs
.
VLLM_USE_V1
and
self
.
compilation_config
.
level
\
==
CompilationLevel
.
PIECEWISE
:
==
CompilationLevel
.
PIECEWISE
:
# default to full and piecewise for most models
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
FULL_AND_PIECEWISE
# pooling model does not support full cudagraphs
if
self
.
model_config
is
not
None
and
\
self
.
model_config
.
pooler_config
is
not
None
:
self
.
compilation_config
.
cudagraph_mode
=
\
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
PIECEWISE
CUDAGraphMode
.
PIECEWISE
else
:
else
:
...
...
vllm/config/compilation.py
View file @
24fab45d
...
@@ -228,15 +228,14 @@ class CompilationConfig:
...
@@ -228,15 +228,14 @@ class CompilationConfig:
The mode of the cudagraph:
The mode of the cudagraph:
- NONE, no cudagraph capture.
- NONE, no cudagraph capture.
- PIECEWISE. (v1 default)
- PIECEWISE.
- FULL.
- FULL.
- FULL_DECODE_ONLY.
- FULL_DECODE_ONLY.
- FULL_AND_PIECEWISE.
- FULL_AND_PIECEWISE. (v1 default)
PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
incompatible ops (i.e. some attention ops) outside the cudagraph
incompatible ops (i.e. some attention ops) outside the cudagraph
for general flexibility.
for general flexibility.
This is the default mode.
FULL mode: Capture full cudagraph for all batches. Can be good for small
FULL mode: Capture full cudagraph for all batches. Can be good for small
models or workloads with small prompts; not supported by many backends.
models or workloads with small prompts; not supported by many backends.
...
@@ -249,7 +248,7 @@ class CompilationConfig:
...
@@ -249,7 +248,7 @@ class CompilationConfig:
FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
piecewise cudagraph for prefill and mixed prefill-decode batches.
piecewise cudagraph for prefill and mixed prefill-decode batches.
This is like the most performant mode for most models.
This is the most performant mode for most models and is the default.
Currently, the cudagraph mode is only used for the v1 engine.
Currently, the cudagraph mode is only used for the v1 engine.
Note that the cudagraph logic is generally orthogonal to the
Note that the cudagraph logic is generally orthogonal to the
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
24fab45d
...
@@ -2947,8 +2947,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2947,8 +2947,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# TODO(luka) better system for describing dummy batches
# TODO(luka) better system for describing dummy batches
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
else
:
else
:
# Make sure max_model_len is used at the graph capture time.
seq_lens
=
max_query_len
seq_lens
=
self
.
max_model_len
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
copy_to_gpu
()
self
.
seq_lens
.
copy_to_gpu
()
...
@@ -3541,6 +3540,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -3541,6 +3540,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
CUDAGraphMode
.
FULL_DECODE_ONLY
CUDAGraphMode
.
FULL_DECODE_ONLY
logger
.
warning
(
msg
)
logger
.
warning
(
msg
)
# check that if we are doing decode full-cudagraphs it is supported
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
and
min_cg_support
==
AttentionCGSupport
.
NEVER
):
msg
=
(
f
"CUDAGraphMode.
{
cudagraph_mode
.
name
}
is not supported "
f
"with
{
min_cg_builder_name
}
backend (support: "
f
"
{
min_cg_support
}
)"
)
if
(
self
.
compilation_config
.
level
==
CompilationLevel
.
PIECEWISE
and
(
self
.
compilation_config
.
splitting_ops_contain_attention
()
or
self
.
compilation_config
.
use_inductor_graph_partition
)):
msg
+=
"; setting cudagraph_mode=PIECEWISE because "
\
"attention is compiled piecewise"
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
PIECEWISE
else
:
msg
+=
"; setting cudagraph_mode=NONE because "
\
"attention is not compiled piecewise"
cudagraph_mode
=
self
.
compilation_config
.
cudagraph_mode
=
\
CUDAGraphMode
.
NONE
logger
.
warning
(
msg
)
# check that if we are doing spec-decode + decode full-cudagraphs it is
# check that if we are doing spec-decode + decode full-cudagraphs it is
# supported
# supported
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment