Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
75eb302a
Unverified
Commit 75eb302a authored Dec 16, 2025 by Nicolò Lucchesi
Committed by GitHub, Dec 16, 2025
Browse files
[Bugfix] Whisper fix number of allocated CrossAttn blocks per-request (#30772)
Signed-off-by: NickLucche <nlucches@redhat.com>
parent
9dbbc59b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing 1 changed file with 11 additions and 11 deletions
+11
-11
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+11
-11
No files found.
vllm/v1/core/sched/scheduler.py
View file @
75eb302a
...
@@ -187,6 +187,12 @@ class Scheduler(SchedulerInterface):
...
@@ -187,6 +187,12 @@ class Scheduler(SchedulerInterface):
if self.is_encoder_decoder
if self.is_encoder_decoder
else EncoderCacheManager(cache_size=encoder_cache_size)
else EncoderCacheManager(cache_size=encoder_cache_size)
)
)
# For encoder-decoder models, allocate the maximum number of tokens for Cross
# Attn blocks, as for Whisper its input is always padded to the maximum length.
# TODO (NickLucche): Generalize to models with variable-length encoder inputs.
self._num_encoder_max_input_tokens = (
    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(vllm_config.model_config)
)
speculative_config = vllm_config.speculative_config
speculative_config = vllm_config.speculative_config
self.use_eagle = False
self.use_eagle = False
...
@@ -568,17 +574,11 @@ class Scheduler(SchedulerInterface):
...
@@ -568,17 +574,11 @@ class Scheduler(SchedulerInterface):
0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
)
)
# Determine if we need to allocate cross-attention blocks.
if self.is_encoder_decoder and request.has_encoder_inputs:
# TODO(russellb): For Whisper, we know that the input is
# always padded to the maximum length. If we support other
# encoder-decoder models, this will need to be updated if we
# want to only allocate what is needed.
num_encoder_tokens = (
num_encoder_tokens = (
self.scheduler_config.max_num_encoder_input_tokens
self._num_encoder_max_input_tokens
if self.is_encoder_decoder and request.has_encoder_inputs
else 0
)
)
else:
num_encoder_tokens = 0
new_blocks = self.kv_cache_manager.allocate_slots(
new_blocks = self.kv_cache_manager.allocate_slots(
request,
request,
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment