Unverified commit 75eb302a, authored by Nicolò Lucchesi, committed by GitHub
Browse files

[Bugfix] Whisper fix number of allocated CrossAttn blocks per-request (#30772)


Signed-off-by: NickLucche <nlucches@redhat.com>
parent 9dbbc59b
...@@ -187,6 +187,12 @@ class Scheduler(SchedulerInterface): ...@@ -187,6 +187,12 @@ class Scheduler(SchedulerInterface):
if self.is_encoder_decoder if self.is_encoder_decoder
else EncoderCacheManager(cache_size=encoder_cache_size) else EncoderCacheManager(cache_size=encoder_cache_size)
) )
# For encoder-decoder models, allocate the maximum number of tokens for Cross
# Attn blocks, as for Whisper its input is always padded to the maximum length.
# TODO (NickLucche): Generalize to models with variable-length encoder inputs.
self._num_encoder_max_input_tokens = (
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(vllm_config.model_config)
)
speculative_config = vllm_config.speculative_config speculative_config = vllm_config.speculative_config
self.use_eagle = False self.use_eagle = False
...@@ -568,17 +574,11 @@ class Scheduler(SchedulerInterface): ...@@ -568,17 +574,11 @@ class Scheduler(SchedulerInterface):
0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens 0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
) )
# Determine if we need to allocate cross-attention blocks. num_encoder_tokens = (
if self.is_encoder_decoder and request.has_encoder_inputs: self._num_encoder_max_input_tokens
# TODO(russellb): For Whisper, we know that the input is if self.is_encoder_decoder and request.has_encoder_inputs
# always padded to the maximum length. If we support other else 0
# encoder-decoder models, this will need to be updated if we )
# want to only allocate what is needed.
num_encoder_tokens = (
self.scheduler_config.max_num_encoder_input_tokens
)
else:
num_encoder_tokens = 0
new_blocks = self.kv_cache_manager.allocate_slots( new_blocks = self.kv_cache_manager.allocate_slots(
request, request,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment