Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d3fa2342
Commit
d3fa2342
authored
Jan 06, 2026
by
zhuwenwen
Browse files
[Perf] Change default CUDAGraphMode from FULL_AND_PIECEWISE to PIECEWISE
parent
55989b60
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
58 additions
and
50 deletions
+58
-50
vllm/config/compilation.py
vllm/config/compilation.py
+1
-1
vllm/config/vllm.py
vllm/config/vllm.py
+24
-21
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+33
-28
No files found.
vllm/config/compilation.py
View file @
d3fa2342
...
@@ -78,7 +78,7 @@ class CUDAGraphMode(enum.Enum):
...
@@ -78,7 +78,7 @@ class CUDAGraphMode(enum.Enum):
return
self
.
has_mode
(
CUDAGraphMode
.
PIECEWISE
)
return
self
.
has_mode
(
CUDAGraphMode
.
PIECEWISE
)
def
max_cudagraph_mode
(
self
)
->
"CUDAGraphMode"
:
def
max_cudagraph_mode
(
self
)
->
"CUDAGraphMode"
:
return
CUDAGraphMode
(
max
(
self
.
value
)
if
not
envs
.
VLLM_USE_PIECEWISE
else
min
(
self
.
value
)
)
if
self
.
separate_routine
()
else
self
return
CUDAGraphMode
(
max
(
self
.
value
))
if
self
.
separate_routine
()
else
self
def
has_full_cudagraphs
(
self
)
->
bool
:
def
has_full_cudagraphs
(
self
)
->
bool
:
return
self
.
max_cudagraph_mode
()
==
CUDAGraphMode
.
FULL
return
self
.
max_cudagraph_mode
()
==
CUDAGraphMode
.
FULL
...
...
vllm/config/vllm.py
View file @
d3fa2342
...
@@ -694,6 +694,7 @@ class VllmConfig:
...
@@ -694,6 +694,7 @@ class VllmConfig:
if
current_platform
.
support_static_graph_mode
():
if
current_platform
.
support_static_graph_mode
():
# if cudagraph_mode has full cudagraphs, we need to check support
# if cudagraph_mode has full cudagraphs, we need to check support
if
model_config
:
=
self
.
model_config
:
if
model_config
:
=
self
.
model_config
:
if
not
envs
.
VLLM_USE_PIECEWISE
:
if
(
if
(
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
and
model_config
.
pooler_config
is
not
None
and
model_config
.
pooler_config
is
not
None
...
@@ -716,6 +717,8 @@ class VllmConfig:
...
@@ -716,6 +717,8 @@ class VllmConfig:
self
.
compilation_config
.
cudagraph_mode
=
(
self
.
compilation_config
.
cudagraph_mode
=
(
CUDAGraphMode
.
FULL_DECODE_ONLY
CUDAGraphMode
.
FULL_DECODE_ONLY
)
)
else
:
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
# disable cudagraph when enforce eager execution
# disable cudagraph when enforce eager execution
if
self
.
model_config
is
not
None
and
self
.
model_config
.
enforce_eager
:
if
self
.
model_config
is
not
None
and
self
.
model_config
.
enforce_eager
:
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
d3fa2342
...
@@ -4109,7 +4109,11 @@ class GPUModelRunner(
...
@@ -4109,7 +4109,11 @@ class GPUModelRunner(
# TODO(luka) better system for describing dummy batches
# TODO(luka) better system for describing dummy batches
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
seq_lens
=
[
1
]
*
num_decode_tokens
+
[
num_prefill_tokens
+
1
]
else
:
else
:
seq_lens
=
max_query_len
# type: ignore[assignment]
if
not
envs
.
VLLM_USE_PIECEWISE
:
seq_lens
=
max_query_len
else
:
# Make sure max_model_len is used at the graph capture time.
seq_lens
=
self
.
max_model_len
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[:
num_reqs
]
=
seq_lens
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
copy_to_gpu
()
self
.
seq_lens
.
copy_to_gpu
()
...
@@ -4825,6 +4829,7 @@ class GPUModelRunner(
...
@@ -4825,6 +4829,7 @@ class GPUModelRunner(
logger
.
warning
(
msg
)
logger
.
warning
(
msg
)
# check that if we are doing decode full-cudagraphs it is supported
# check that if we are doing decode full-cudagraphs it is supported
if
not
envs
.
VLLM_USE_PIECEWISE
:
if
(
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
and
min_cg_support
==
AttentionCGSupport
.
NEVER
and
min_cg_support
==
AttentionCGSupport
.
NEVER
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment