Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
aa3b3d76
Unverified
Commit
aa3b3d76
authored
Apr 11, 2025
by
Michael Goin
Committed by
GitHub
Apr 11, 2025
Browse files
Enforce valid max_num_batched_tokens when disable_chunked_mm_input=True (#16447)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent
f7030df3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
18 additions
and
1 deletion
+18
-1
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+9
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/encoder_cache_manager.py
+8
-0
No files found.
tests/v1/core/test_scheduler.py
View file @
aa3b3d76
...
...
@@ -322,6 +322,15 @@ def test_no_mm_input_chunking():
assert
len
(
output
.
finished_req_ids
)
==
0
assert
output
.
num_scheduled_tokens
[
requests
[
0
].
request_id
]
==
800
# Test that we fail if we disable chunked mm input and use too small
# of a max_num_batched_tokens for the mm input.
with
pytest
.
raises
(
ValueError
):
_
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_num_batched_tokens
=
100
,
disable_chunked_mm_input
=
True
,
)
@
pytest
.
mark
.
parametrize
(
"enable_prefix_caching"
,
[
True
,
False
])
def
test_schedule_concurrent_partial_requests
(
enable_prefix_caching
:
bool
):
...
...
vllm/engine/arg_utils.py
View file @
aa3b3d76
...
...
@@ -1030,7 +1030,7 @@ class EngineArgs:
action
=
StoreBoolean
,
default
=
EngineArgs
.
disable_chunked_mm_input
,
nargs
=
"?"
,
const
=
"False"
,
const
=
"True"
,
help
=
"Disable multimodal input chunking attention for V1. "
"If set to true and chunked prefill is enabled, we do not want to"
" partially schedule a multimodal item. This ensures that if a "
...
...
vllm/v1/core/encoder_cache_manager.py
View file @
aa3b3d76
...
...
@@ -133,6 +133,14 @@ def _compute_encoder_budget_multimodal(
_
,
max_tokens_per_mm_item
=
max
(
max_tokens_by_modality_dict
.
items
(),
key
=
lambda
item
:
item
[
1
])
if
(
scheduler_config
.
disable_chunked_mm_input
and
max_tokens_per_mm_item
>
scheduler_config
.
max_num_batched_tokens
):
raise
ValueError
(
"Chunked MM input disabled but max_tokens_per_mm_item "
f"({max_tokens_per_mm_item}) is larger than max_num_batched_tokens"
f" ({scheduler_config.max_num_batched_tokens}). Please increase "
"max_num_batched_tokens."
)
encoder_compute_budget
=
max
(
scheduler_config
.
max_num_encoder_input_tokens
,
max_tokens_per_mm_item
)
encoder_cache_size
=
max
(
scheduler_config
.
encoder_cache_size
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment