Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c756fb67
Unverified
Commit
c756fb67
authored
Dec 10, 2025
by
Nicolò Lucchesi
Committed by
GitHub
Dec 10, 2025
Browse files
[Core] Whisper enable `FULL_DECODE_ONLY` CudaGraph (#30072)
Signed-off-by:
NickLucche
<
nlucches@redhat.com
>
parent
d017bceb
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
31 additions
and
12 deletions
+31
-12
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/test_whisper.py
+2
-0
vllm/config/vllm.py
vllm/config/vllm.py
+19
-11
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+10
-1
No files found.
tests/models/multimodal/generation/test_whisper.py
View file @
c756fb67
...
...
@@ -103,6 +103,8 @@ def run_test(
max_model_len
=
448
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
# TODO (NickLucche) figure out output differences with non-eager and re-enable
enforce_eager
=
True
,
)
as
vllm_model
:
llm
=
vllm_model
.
llm
...
...
vllm/config/vllm.py
View file @
c756fb67
...
...
@@ -666,8 +666,9 @@ class VllmConfig:
default_config
=
OPTIMIZATION_LEVEL_TO_CONFIG
[
self
.
optimization_level
]
self
.
_apply_optimization_level_defaults
(
default_config
)
if
(
self
.
compilation_config
.
cudagraph_mode
!=
CUDAGraphMode
.
NONE
self
.
compilation_config
.
cudagraph_mode
.
requires_piecewise_compilation
()
and
self
.
compilation_config
.
mode
!=
CompilationMode
.
VLLM_COMPILE
):
logger
.
info
(
...
...
@@ -692,22 +693,29 @@ class VllmConfig:
if
current_platform
.
support_static_graph_mode
():
# if cudagraph_mode has full cudagraphs, we need to check support
if
model_config
:
=
self
.
model_config
:
if
(
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
and
self
.
model
_config
is
not
None
and
model_config
.
pooler
_config
is
not
None
):
if
self
.
model_config
.
pooler_config
is
not
None
:
logger
.
warning_once
(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
elif
self
.
model_config
.
is_encoder_decoder
:
logger
.
warning_once
(
"Encoder-decoder models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
elif
(
model_config
.
is_encoder_decoder
and
self
.
compilation_config
.
cudagraph_mode
not
in
(
CUDAGraphMode
.
NONE
,
CUDAGraphMode
.
FULL_DECODE_ONLY
)
):
logger
.
info_once
(
"Encoder-decoder models do not support %s. "
"Overriding cudagraph_mode to FULL_DECODE_ONLY."
,
self
.
compilation_config
.
cudagraph_mode
.
name
,
)
self
.
compilation_config
.
cudagraph_mode
=
(
CUDAGraphMode
.
FULL_DECODE_ONLY
)
self
.
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
# disable cudagraph when enforce eager execution
if
self
.
model_config
is
not
None
and
self
.
model_config
.
enforce_eager
:
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
c756fb67
...
...
@@ -1267,6 +1267,8 @@ class GPUModelRunner(
if
not
isinstance
(
kv_cache_spec
,
CrossAttentionSpec
):
return
None
,
None
# Zero out buffer for padding requests that are not actually scheduled (CGs)
self
.
encoder_seq_lens
.
np
[:
num_reqs
]
=
0
# Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch
for
req_id
in
num_scheduled_tokens
:
...
...
@@ -2764,6 +2766,7 @@ class GPUModelRunner(
# be improved in model runner v2)
force_uniform_decode
:
bool
|
None
=
None
,
force_has_lora
:
bool
|
None
=
None
,
num_encoder_reqs
:
int
=
0
,
)
->
tuple
[
CUDAGraphMode
,
BatchDescriptor
,
...
...
@@ -2780,6 +2783,11 @@ class GPUModelRunner(
if
force_uniform_decode
is
None
else
force_uniform_decode
)
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
# is present). Also, chunked-prefill is disabled, so batches are uniform.
has_encoder_output
=
(
self
.
model_config
.
is_encoder_decoder
and
num_encoder_reqs
>
0
)
has_lora
=
(
len
(
self
.
input_batch
.
lora_id_to_lora_request
)
>
0
...
...
@@ -2799,7 +2807,7 @@ class GPUModelRunner(
)
cudagraph_mode
,
batch_descriptor
=
dispatch_cudagraph
(
num_tokens_padded
,
use_cascade_attn
num_tokens_padded
,
use_cascade_attn
or
has_encoder_output
)
num_tokens_padded
=
batch_descriptor
.
num_tokens
...
...
@@ -2997,6 +3005,7 @@ class GPUModelRunner(
num_scheduled_tokens_np
=
num_scheduled_tokens_np
,
max_num_scheduled_tokens
=
max_num_scheduled_tokens
,
use_cascade_attn
=
cascade_attn_prefix_lens
is
not
None
,
num_encoder_reqs
=
len
(
scheduler_output
.
scheduled_encoder_inputs
),
)
logger
.
debug
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment