Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0c6e40bb
Unverified
Commit
0c6e40bb
authored
Aug 21, 2025
by
Cyrus Leung
Committed by
GitHub
Aug 21, 2025
Browse files
[Refactor] Simplify code for MM budget (#23310)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
2e2000f3
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
57 additions
and
68 deletions
+57
-68
vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/encoder_cache_manager.py
+32
-24
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+5
-13
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
+3
-10
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+17
-21
No files found.
vllm/v1/core/encoder_cache_manager.py
View file @
0c6e40bb
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Mapping
from
typing
import
TYPE_CHECKING
from
vllm.logger
import
init_logger
...
...
@@ -188,35 +188,47 @@ def compute_encoder_budget(
- Space budget for encoder cache size, in unit of number of tokens
in the input sequence.
"""
if
mm_registry
.
supports_multimodal_inputs
(
model_config
):
max_tokens_by_modality
=
mm_registry
\
.
get_max_tokens_per_item_by_nonzero_modality
(
model_config
)
if
not
mm_registry
.
supports_multimodal_inputs
(
model_config
):
return
0
,
0
# TODO: handle encoder-decoder models once we support them.
(
encoder_compute_budget
,
encoder_cache_size
,
)
=
_compute_encoder_budget_multimodal
(
model_config
,
return
compute_mm_encoder_budget
(
scheduler_config
,
mm_registry
,
max_tokens_by_modality
,
)
return
encoder_compute_budget
,
encoder_cache_size
return
compute_text_encoder_budget
(
scheduler_config
)
def
_compute_encoder_budget_multimodal
(
model_config
:
"ModelConfig"
,
def
compute_text_encoder_budget
(
scheduler_config
:
"SchedulerConfig"
)
->
tuple
[
int
,
int
]:
"""Compute the encoder cache budget based on the model and scheduler
configurations for a text-only model.
Args:
scheduler_config: Scheduler configuration.
Returns:
- Compute budget for encoder execution, in unit of number of tokens
in the input sequence.
- Space budget for encoder cache size, in unit of number of tokens
in the input sequence.
"""
# Currently text-only encoder-decoder models are not supported
return
0
,
0
def
compute_mm_encoder_budget
(
scheduler_config
:
"SchedulerConfig"
,
mm_registry
:
MultiModalRegistry
,
max_tokens_by_modality
:
Mapping
[
str
,
int
]
,
)
->
tuple
[
int
,
int
]:
"""Compute the encoder cache budget based on the model and scheduler
configurations for a multimodal model.
Args:
model_config: Model configuration.
scheduler_config: Scheduler configuration.
mm_registry: Provides information about the token cost.
max_tokens_by_modality: The maximum number of tokens for each
non-text modality.
Returns:
- Compute budget for encoder execution, in unit of number of tokens
...
...
@@ -225,18 +237,14 @@ def _compute_encoder_budget_multimodal(
in the input sequence.
"""
max_tokens_by_modality_dict
=
mm_registry
\
.
get_max_tokens_per_item_by_nonzero_modality
(
model_config
)
if
not
max_tokens_by_modality_dict
:
if
not
max_tokens_by_modality
:
logger
.
warning
(
"All non-text modalities supported by the model have been "
"explicitly disabled via limit_mm_per_prompt. Encoder cache will "
"not be initialized."
)
return
0
,
0
_
,
max_tokens_per_mm_item
=
max
(
max_tokens_by_modality_dict
.
items
(),
key
=
lambda
item
:
item
[
1
])
max_tokens_per_mm_item
=
max
(
max_tokens_by_modality
.
values
())
if
(
scheduler_config
.
disable_chunked_mm_input
and
max_tokens_per_mm_item
>
scheduler_config
.
max_num_batched_tokens
):
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
0c6e40bb
...
...
@@ -341,10 +341,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self
.
model_config
,
self
.
scheduler_config
,
self
.
mm_registry
,
max_model_len
=
self
.
max_model_len
,
max_num_reqs
=
self
.
max_num_reqs
,
)
if
self
.
supports_mm_inputs
\
else
None
)
)
if
self
.
supports_mm_inputs
else
None
)
self
.
reorder_batch_threshold
:
Optional
[
int
]
=
None
...
...
@@ -669,7 +666,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
mm_budget
=
self
.
mm_budget
assert
mm_budget
is
not
None
dummy_modality
,
_
=
mm_budget
.
get_modality_with_max_tokens
()
dummy_modality
=
mm_budget
.
get_modality_with_max_tokens
()
return
self
.
_get_mm_dummy_batch
(
dummy_modality
,
num_seqs
)
...
...
@@ -2595,14 +2592,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
(
dummy_modality
,
max_tokens
,
)
=
mm_budget
.
get_modality_with_max_tokens
()
(
max_mm_items_per_prompt
,
max_mm_items_per_batch
,
)
=
mm_budget
.
get_max_items
(
dummy_modality
,
max_tokens
)
dummy_modality
=
mm_budget
.
get_modality_with_max_tokens
()
max_mm_items_per_batch
=
mm_budget
\
.
max_items_per_batch_by_modality
[
dummy_modality
]
logger
.
info
(
"Encoder cache will be initialized with a budget of "
...
...
vllm/v1/worker/tpu_model_runner.py
View file @
0c6e40bb
...
...
@@ -292,8 +292,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self
.
model_config
,
self
.
scheduler_config
,
self
.
mm_registry
,
max_model_len
=
self
.
max_model_len
,
max_num_reqs
=
self
.
max_num_reqs
,
)
if
self
.
supports_mm_inputs
else
None
)
if
not
self
.
use_spmd
:
...
...
@@ -1545,14 +1543,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
(
dummy_modality
,
max_tokens
,
)
=
mm_budget
.
get_modality_with_max_tokens
()
(
max_mm_items_per_prompt
,
max_mm_items_per_batch
,
)
=
mm_budget
.
get_max_items
(
dummy_modality
,
max_tokens
)
dummy_modality
=
mm_budget
.
get_modality_with_max_tokens
()
max_mm_items_per_batch
=
mm_budget
\
.
max_items_per_batch_by_modality
[
dummy_modality
]
logger
.
info
(
"Encoder cache will be initialized with a budget of "
...
...
vllm/v1/worker/utils.py
View file @
0c6e40bb
...
...
@@ -12,7 +12,7 @@ from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from
vllm.model_executor.models.utils
import
extract_layer_index
from
vllm.multimodal.registry
import
MultiModalRegistry
from
vllm.v1.attention.backends.utils
import
AttentionMetadataBuilder
from
vllm.v1.core.encoder_cache_manager
import
compute_encoder_budget
from
vllm.v1.core.encoder_cache_manager
import
compute_mm_encoder_budget
from
vllm.v1.kv_cache_interface
import
KVCacheGroupSpec
if
TYPE_CHECKING
:
...
...
@@ -27,9 +27,6 @@ class MultiModalBudget:
model_config
:
ModelConfig
,
scheduler_config
:
SchedulerConfig
,
mm_registry
:
MultiModalRegistry
,
*
,
max_model_len
:
int
,
max_num_reqs
:
int
,
)
->
None
:
super
().
__init__
()
...
...
@@ -37,25 +34,25 @@ class MultiModalBudget:
self
.
scheduler_config
=
scheduler_config
self
.
mm_registry
=
mm_registry
encoder_compute_budget
,
encoder_cache_size
=
compute_encoder_budget
(
model_config
=
model_config
,
scheduler_config
=
scheduler_config
,
mm_registry
=
mm_registry
,
self
.
max_model_len
=
model_config
.
max_model_len
self
.
max_num_reqs
=
scheduler_config
.
max_num_seqs
self
.
mm_limits
=
mm_registry
.
get_mm_limits_per_prompt
(
model_config
)
max_tokens_by_modality
=
mm_registry
\
.
get_max_tokens_per_item_by_nonzero_modality
(
model_config
)
encoder_compute_budget
,
encoder_cache_size
=
compute_mm_encoder_budget
(
scheduler_config
,
max_tokens_by_modality
,
)
self
.
max_num_encoder_input_tokens
=
encoder_compute_budget
self
.
encoder_compute_budget
=
encoder_compute_budget
self
.
encoder_cache_size
=
encoder_cache_size
self
.
max_model_len
=
max_model_len
self
.
max_num_reqs
=
max_num_reqs
self
.
mm_limits
=
mm_registry
.
get_mm_limits_per_prompt
(
model_config
)
max_items_per_prompt_by_modality
=
dict
[
str
,
int
]()
max_items_per_batch_by_modality
=
dict
[
str
,
int
]()
max_tokens_by_modality
=
mm_registry
\
.
get_max_tokens_per_item_by_nonzero_modality
(
model_config
)
for
modality
,
max_tokens
in
max_tokens_by_modality
.
items
():
(
max_items_per_prompt
,
...
...
@@ -69,15 +66,14 @@ class MultiModalBudget:
self
.
max_items_per_prompt_by_modality
=
max_items_per_prompt_by_modality
self
.
max_items_per_batch_by_modality
=
max_items_per_batch_by_modality
def
get_modality_with_max_tokens
(
self
)
->
tuple
[
str
,
int
]
:
def
get_modality_with_max_tokens
(
self
)
->
str
:
max_tokens_by_modality
=
self
.
max_tokens_by_modality
modality
,
max_tokens
=
max
(
max_tokens_by_modality
.
items
(),
key
=
lambda
item
:
item
[
1
])
modality
,
_
=
max
(
max_tokens_by_modality
.
items
(),
key
=
lambda
x
:
x
[
1
])
return
modality
,
max_tokens
return
modality
def
get_encoder_budget
(
self
)
->
int
:
return
min
(
self
.
max_num_encoder_input_tokens
,
self
.
encoder_cache_size
)
return
min
(
self
.
encoder_compute_budget
,
self
.
encoder_cache_size
)
def
get_max_items
(
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment