Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
58d4c705
Unverified
Commit
58d4c705
authored
Sep 16, 2025
by
Russell Bryant
Committed by
GitHub
Sep 16, 2025
Browse files
[Core] Get num_encoder_tokens from scheduler config (#24989)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
ea3de5ef
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
6 additions
and
8 deletions
+6
-8
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+2
-3
vllm/v1/kv_cache_interface.py
vllm/v1/kv_cache_interface.py
+2
-3
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-2
No files found.
vllm/v1/core/sched/scheduler.py
View file @
58d4c705
...
...
@@ -465,9 +465,8 @@ class Scheduler(SchedulerInterface):
in
self
.
vllm_config
.
model_config
.
model
.
lower
()),
(
"Whisper is the only supported "
"encoder-decoder model."
)
num_encoder_tokens
=
MULTIMODAL_REGISTRY
.
\
get_encdec_max_encoder_len
(
self
.
vllm_config
.
model_config
)
num_encoder_tokens
=
\
self
.
scheduler_config
.
max_num_encoder_input_tokens
else
:
num_encoder_tokens
=
0
...
...
vllm/v1/kv_cache_interface.py
View file @
58d4c705
...
...
@@ -11,7 +11,6 @@ from typing_extensions import Self
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.utils
import
cdiv
,
get_dtype_size
logger
=
init_logger
(
__name__
)
...
...
@@ -230,8 +229,8 @@ class CrossAttentionSpec(AttentionSpec):
def
max_memory_usage_bytes
(
self
,
vllm_config
:
VllmConfig
)
->
int
:
# For cross-attention, we need to cache encoder states
# Get encoder length (e.g., 1500 for Whisper).
max_encoder_len
=
MULTIMODAL_REGISTRY
.
\
get_encdec_max_encoder_len
(
vllm_config
.
model_config
)
max_encoder_len
=
vllm_config
.
scheduler_config
.
\
max_num_encoder_input_tokens
return
cdiv
(
max_encoder_len
,
self
.
block_size
)
*
self
.
page_size_bytes
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
58d4c705
...
...
@@ -234,8 +234,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if
self
.
model_config
.
is_encoder_decoder
:
# Maximum length of the encoder input, only for encoder-decoder
# models.
self
.
max_encoder_len
=
s
elf
.
mm_registry
.
\
get_encdec_max_encoder_len
(
model_config
)
self
.
max_encoder_len
=
s
cheduler_config
.
\
max_num_encoder_input_tokens
else
:
self
.
max_encoder_len
=
0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment