Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b5dcb372
Unverified
Commit
b5dcb372
authored
Feb 11, 2026
by
Cyrus Leung
Committed by
GitHub
Feb 10, 2026
Browse files
[Misc] Clean up validation logic in input processor (#34144)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
066c6da6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
72 additions
and
86 deletions
+72
-86
tests/v1/engine/test_process_multi_modal_uuids.py
tests/v1/engine/test_process_multi_modal_uuids.py
+0
-1
vllm/multimodal/encoder_budget.py
vllm/multimodal/encoder_budget.py
+1
-0
vllm/v1/engine/input_processor.py
vllm/v1/engine/input_processor.py
+71
-85
No files found.
tests/v1/engine/test_process_multi_modal_uuids.py
View file @
b5dcb372
...
@@ -20,7 +20,6 @@ def _build_input_processor(
...
@@ -20,7 +20,6 @@ def _build_input_processor(
)
->
InputProcessor
:
)
->
InputProcessor
:
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
skip_tokenizer_init
=
True
,
max_model_len
=
128
,
max_model_len
=
128
,
mm_processor_cache_gb
=
mm_cache_gb
,
mm_processor_cache_gb
=
mm_cache_gb
,
)
)
...
...
vllm/multimodal/encoder_budget.py
View file @
b5dcb372
...
@@ -62,6 +62,7 @@ class MultiModalBudget:
...
@@ -62,6 +62,7 @@ class MultiModalBudget:
processor
=
mm_registry
.
create_processor
(
model_config
,
cache
=
cache
)
processor
=
mm_registry
.
create_processor
(
model_config
,
cache
=
cache
)
self
.
cache
=
cache
self
.
cache
=
cache
self
.
processor
=
processor
mm_config
=
model_config
.
get_multimodal_config
()
mm_config
=
model_config
.
get_multimodal_config
()
enable_mm_embeds
=
mm_config
is
not
None
and
mm_config
.
enable_mm_embeds
enable_mm_embeds
=
mm_config
is
not
None
and
mm_config
.
enable_mm_embeds
...
...
vllm/v1/engine/input_processor.py
View file @
b5dcb372
...
@@ -72,13 +72,15 @@ class InputProcessor:
...
@@ -72,13 +72,15 @@ class InputProcessor:
self
.
mm_registry
=
mm_registry
self
.
mm_registry
=
mm_registry
self
.
mm_processor_cache
=
mm_registry
.
processor_cache_from_config
(
vllm_config
)
self
.
mm_processor_cache
=
mm_registry
.
processor_cache_from_config
(
vllm_config
)
self
.
mm_encoder_cache_size
:
int
|
None
=
None
self
.
supports_mm_inputs
=
mm_registry
.
supports_multimodal_inputs
(
model_config
)
if
(
self
.
mm_encoder_cache_size
=
0
mm_registry
.
supports_multimodal_inputs
(
model_config
)
self
.
skip_prompt_length_check
=
False
and
not
model_config
.
skip_tokenizer_init
if
self
.
supports_mm_inputs
:
):
mm_budget
=
MultiModalBudget
(
vllm_config
,
mm_registry
)
mm_budget
=
MultiModalBudget
(
vllm_config
,
mm_registry
)
self
.
mm_encoder_cache_size
=
mm_budget
.
encoder_cache_size
self
.
mm_encoder_cache_size
=
mm_budget
.
encoder_cache_size
self
.
skip_prompt_length_check
=
(
mm_budget
.
processor
.
info
.
skip_prompt_length_check
)
mm_budget
.
reset_cache
()
# Not used anymore
mm_budget
.
reset_cache
()
# Not used anymore
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
input_preprocessor
=
InputPreprocessor
(
...
@@ -670,76 +672,25 @@ class InputProcessor:
...
@@ -670,76 +672,25 @@ class InputProcessor:
resumable
=
resumable
,
resumable
=
resumable
,
)
)
def
_validate_model_inputs
(
def
_validate_prompt_len
(
self
,
encoder_inputs
:
SingletonInputs
|
None
,
decoder_inputs
:
SingletonInputs
):
if
encoder_inputs
is
not
None
:
self
.
_validate_model_input
(
encoder_inputs
,
prompt_type
=
"encoder"
)
self
.
_validate_model_input
(
decoder_inputs
,
prompt_type
=
"decoder"
)
def
_validate_model_input
(
self
,
self
,
prompt_inputs
:
SingletonInputs
,
prompt_len
:
int
,
*
,
prompt_type
:
Literal
[
"encoder"
,
"decoder"
],
prompt_type
:
Literal
[
"encoder"
,
"decoder"
],
):
):
model_config
=
self
.
model_config
if
self
.
skip_prompt_length_check
:
return
prompt_ids
=
(
None
if
prompt_inputs
[
"type"
]
==
"embeds"
else
prompt_inputs
[
"prompt_token_ids"
]
)
prompt_embeds
=
(
prompt_inputs
[
"prompt_embeds"
]
if
prompt_inputs
[
"type"
]
==
"embeds"
else
None
)
prompt_len
=
length_from_prompt_token_ids_or_embeds
(
prompt_ids
,
prompt_embeds
)
if
not
prompt_ids
:
if
prompt_type
==
"encoder"
and
model_config
.
is_multimodal_model
:
pass
# Mllama may have empty encoder inputs for text-only data
elif
prompt_inputs
[
"type"
]
==
"embeds"
:
pass
# Prompt embeds should not have prompt_ids.
else
:
raise
ValueError
(
f
"The
{
prompt_type
}
prompt cannot be empty"
)
tokenizer
=
self
.
tokenizer
if
tokenizer
is
not
None
:
max_input_id
=
max
(
prompt_ids
or
(),
default
=
0
)
# NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
# self.model_config.get_vocab_size() is the model’s vocab size.
# For Qwen3 models, the language model has extra tokens that do
# not exist in the tokenizer, and vice versa for multimodal
# placeholder tokens in some multimodal models.
# See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
# and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501
# Here we take the max of the two to determine if a token id is
if
prompt_len
==
0
and
prompt_type
==
"decoder"
:
# truly out-of-vocabulary.
raise
ValueError
(
f
"The
{
prompt_type
}
prompt cannot be empty"
)
if
max_input_id
>
max
(
tokenizer
.
max_token_id
,
self
.
model_config
.
get_vocab_size
()
-
1
):
raise
ValueError
(
f
"Token id
{
max_input_id
}
is out of vocabulary"
)
max_prompt_len
=
self
.
model_config
.
max_model_len
model_config
=
self
.
model_config
max_prompt_len
=
(
model_config
.
max_model_len
if
prompt_type
==
"decoder"
else
self
.
mm_encoder_cache_size
)
if
prompt_len
>
max_prompt_len
:
if
prompt_len
>
max_prompt_len
:
if
model_config
.
is_multimodal_model
:
if
self
.
supports_mm_inputs
:
mm_registry
=
self
.
input_preprocessor
.
mm_registry
model_cls
=
mm_registry
.
_get_model_cls
(
model_config
)
factories
=
model_cls
.
_processor_factory
ctx
=
mm_registry
.
_create_processing_ctx
(
model_config
,
tokenizer
=
tokenizer
,
)
mm_info
=
factories
.
info
(
ctx
)
if
mm_info
.
skip_prompt_length_check
:
return
if
model_config
.
is_multimodal_model
:
suggestion
=
(
suggestion
=
(
"Make sure that `max_model_len` is no smaller than the "
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens plus multimodal tokens. For image "
"number of text tokens plus multimodal tokens. For image "
...
@@ -757,17 +708,7 @@ class InputProcessor:
...
@@ -757,17 +708,7 @@ class InputProcessor:
f
"longer than the maximum model length of
{
max_prompt_len
}
. "
f
"longer than the maximum model length of
{
max_prompt_len
}
. "
f
"
{
suggestion
}
"
f
"
{
suggestion
}
"
)
)
elif
prompt_len
==
max_prompt_len
and
model_config
.
runner_type
==
"generate"
:
# TODO: Find out how many placeholder tokens are there so we can
# check that chunked prefill does not truncate them
# max_batch_len = self.scheduler_config.max_num_batched_tokens
if
(
prompt_len
==
max_prompt_len
and
prompt_type
==
"decoder"
and
not
model_config
.
is_multimodal_model
and
self
.
model_config
.
runner_type
!=
"pooling"
):
suggestion
=
(
suggestion
=
(
"Make sure that `max_model_len` is no smaller than the "
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens (prompt + requested output tokens)."
"number of text tokens (prompt + requested output tokens)."
...
@@ -778,11 +719,29 @@ class InputProcessor:
...
@@ -778,11 +719,29 @@ class InputProcessor:
f
"model length of
{
max_prompt_len
}
.
{
suggestion
}
"
f
"model length of
{
max_prompt_len
}
.
{
suggestion
}
"
)
)
if
(
def
_validate_model_input
(
prompt_type
==
"decoder"
self
,
and
prompt_inputs
[
"type"
]
==
"multimodal"
prompt_inputs
:
SingletonInputs
,
and
self
.
mm_encoder_cache_size
is
not
None
prompt_type
:
Literal
[
"encoder"
,
"decoder"
],
):
)
->
None
:
model_config
=
self
.
model_config
tokenizer
=
self
.
tokenizer
prompt_ids
=
(
None
if
prompt_inputs
[
"type"
]
==
"embeds"
else
prompt_inputs
[
"prompt_token_ids"
]
)
prompt_embeds
=
(
prompt_inputs
[
"prompt_embeds"
]
if
prompt_inputs
[
"type"
]
==
"embeds"
else
None
)
prompt_len
=
length_from_prompt_token_ids_or_embeds
(
prompt_ids
,
prompt_embeds
)
self
.
_validate_prompt_len
(
prompt_len
,
prompt_type
)
if
prompt_inputs
[
"type"
]
==
"multimodal"
:
decoder_mm_positions
=
prompt_inputs
[
"mm_placeholders"
]
decoder_mm_positions
=
prompt_inputs
[
"mm_placeholders"
]
for
modality
,
mm_positions
in
decoder_mm_positions
.
items
():
for
modality
,
mm_positions
in
decoder_mm_positions
.
items
():
for
mm_position
in
mm_positions
:
for
mm_position
in
mm_positions
:
...
@@ -797,6 +756,33 @@ class InputProcessor:
...
@@ -797,6 +756,33 @@ class InputProcessor:
f
"by setting --limit-mm-per-prompt at startup."
f
"by setting --limit-mm-per-prompt at startup."
)
)
if
prompt_ids
and
tokenizer
is
not
None
:
max_input_id
=
max
(
prompt_ids
,
default
=
0
)
# NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
# self.model_config.get_vocab_size() is the model’s vocab size.
# For Qwen3 models, the language model has extra tokens that do
# not exist in the tokenizer, and vice versa for multimodal
# placeholder tokens in some multimodal models.
# See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
# and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501
# Here we take the max of the two to determine if a token id is
# truly out-of-vocabulary.
model_vocab_size
=
model_config
.
get_vocab_size
()
if
max_input_id
>
max
(
tokenizer
.
max_token_id
,
model_vocab_size
-
1
):
raise
ValueError
(
f
"Token id
{
max_input_id
}
is out of vocabulary"
)
def
_validate_model_inputs
(
self
,
encoder_inputs
:
SingletonInputs
|
None
,
decoder_inputs
:
SingletonInputs
,
):
if
encoder_inputs
is
not
None
:
self
.
_validate_model_input
(
encoder_inputs
,
prompt_type
=
"encoder"
)
self
.
_validate_model_input
(
decoder_inputs
,
prompt_type
=
"decoder"
)
def
stat_mm_cache
(
self
)
->
MultiModalCacheStats
|
None
:
def
stat_mm_cache
(
self
)
->
MultiModalCacheStats
|
None
:
return
self
.
input_preprocessor
.
stat_mm_cache
()
return
self
.
input_preprocessor
.
stat_mm_cache
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment