Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
48312e57
Unverified
Commit
48312e57
authored
Feb 07, 2026
by
Cyrus Leung
Committed by
GitHub
Feb 07, 2026
Browse files
[Misc] Make `PlaceholderRange.get_num_embeds` a method (#34035)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
bc32444b
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
11 additions
and
12 deletions
+11
-12
tests/models/multimodal/processing/test_mllama4.py
tests/models/multimodal/processing/test_mllama4.py
+1
-1
tests/multimodal/test_inputs.py
tests/multimodal/test_inputs.py
+1
-1
tests/v1/core/test_encoder_cache_manager.py
tests/v1/core/test_encoder_cache_manager.py
+4
-4
vllm/multimodal/budget.py
vllm/multimodal/budget.py
+1
-1
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+0
-1
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+1
-1
vllm/v1/engine/input_processor.py
vllm/v1/engine/input_processor.py
+1
-1
vllm/v1/request.py
vllm/v1/request.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-1
No files found.
tests/models/multimodal/processing/test_mllama4.py
View file @
48312e57
...
@@ -48,7 +48,7 @@ def test_profiling(model_id: str, max_model_len: int):
...
@@ -48,7 +48,7 @@ def test_profiling(model_id: str, max_model_len: int):
)
# image start, image, image end
)
# image start, image, image end
assert
total_num_patches
==
sum
(
assert
total_num_patches
==
sum
(
item
.
get_num_embeds
for
item
in
mm_inputs
[
"mm_placeholders"
][
"image"
]
item
.
get_num_embeds
()
for
item
in
mm_inputs
[
"mm_placeholders"
][
"image"
]
)
)
assert
total_tokens
==
sum
(
assert
total_tokens
==
sum
(
placeholder
.
length
for
placeholder
in
mm_inputs
[
"mm_placeholders"
][
"image"
]
placeholder
.
length
for
placeholder
in
mm_inputs
[
"mm_placeholders"
][
"image"
]
...
...
tests/multimodal/test_inputs.py
View file @
48312e57
...
@@ -19,7 +19,7 @@ from vllm.multimodal.inputs import PlaceholderRange
...
@@ -19,7 +19,7 @@ from vllm.multimodal.inputs import PlaceholderRange
def
test_placeholder_range_get_num_embeds
(
is_embed
,
expected
):
def
test_placeholder_range_get_num_embeds
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_num_embeds
==
expected
assert
pr
.
get_num_embeds
()
==
expected
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
...
tests/v1/core/test_encoder_cache_manager.py
View file @
48312e57
...
@@ -187,7 +187,7 @@ def test_schedule_request_multi_images_respect_compute_limit():
...
@@ -187,7 +187,7 @@ def test_schedule_request_multi_images_respect_compute_limit():
def
test_encoder_cache_with_is_embed_mask
():
def
test_encoder_cache_with_is_embed_mask
():
class
MockRequestWithMask
(
MockRequest
):
class
MockRequestWithMask
(
MockRequest
):
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
()
is_embed
=
torch
.
zeros
(
100
,
dtype
=
torch
.
bool
)
is_embed
=
torch
.
zeros
(
100
,
dtype
=
torch
.
bool
)
is_embed
[
torch
.
tensor
([
5
,
15
,
25
,
35
,
45
,
55
,
65
,
75
])]
=
True
is_embed
[
torch
.
tensor
([
5
,
15
,
25
,
35
,
45
,
55
,
65
,
75
])]
=
True
...
@@ -207,7 +207,7 @@ def test_encoder_cache_with_is_embed_mask():
...
@@ -207,7 +207,7 @@ def test_encoder_cache_with_is_embed_mask():
assert
"img1"
in
manager
.
cached
assert
"img1"
in
manager
.
cached
old_size
=
100
old_size
=
100
new_size
=
request
.
mm_features
[
0
].
mm_position
.
get_num_embeds
new_size
=
request
.
mm_features
[
0
].
mm_position
.
get_num_embeds
()
assert
new_size
==
8
assert
new_size
==
8
savings_ratio
=
old_size
/
new_size
savings_ratio
=
old_size
/
new_size
assert
savings_ratio
==
12.5
assert
savings_ratio
==
12.5
...
@@ -216,7 +216,7 @@ def test_encoder_cache_with_is_embed_mask():
...
@@ -216,7 +216,7 @@ def test_encoder_cache_with_is_embed_mask():
def
test_encoder_cache_mask_based_retrieval
():
def
test_encoder_cache_mask_based_retrieval
():
class
MockRequestWithMask
(
MockRequest
):
class
MockRequestWithMask
(
MockRequest
):
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
()
is_embed
=
torch
.
tensor
(
is_embed
=
torch
.
tensor
(
[
False
,
False
,
True
,
True
,
False
,
True
,
True
,
True
,
False
,
False
]
[
False
,
False
,
True
,
True
,
False
,
True
,
True
,
True
,
False
,
False
]
...
@@ -233,7 +233,7 @@ def test_encoder_cache_mask_based_retrieval():
...
@@ -233,7 +233,7 @@ def test_encoder_cache_mask_based_retrieval():
manager
=
EncoderCacheManager
(
cache_size
=
50
)
manager
=
EncoderCacheManager
(
cache_size
=
50
)
manager
.
allocate
(
request
,
0
)
manager
.
allocate
(
request
,
0
)
assert
request
.
mm_features
[
0
].
mm_position
.
get_num_embeds
==
5
assert
request
.
mm_features
[
0
].
mm_position
.
get_num_embeds
()
==
5
start_idx
=
2
start_idx
=
2
end_idx
=
8
end_idx
=
8
...
...
vllm/multimodal/budget.py
View file @
48312e57
...
@@ -33,7 +33,7 @@ def get_mm_max_toks_per_item(
...
@@ -33,7 +33,7 @@ def get_mm_max_toks_per_item(
)
)
return
{
return
{
modality
:
sum
(
item
.
get_num_embeds
for
item
in
placeholders
)
modality
:
sum
(
item
.
get_num_embeds
()
for
item
in
placeholders
)
for
modality
,
placeholders
in
mm_inputs
[
"mm_placeholders"
].
items
()
for
modality
,
placeholders
in
mm_inputs
[
"mm_placeholders"
].
items
()
}
}
...
...
vllm/multimodal/inputs.py
View file @
48312e57
...
@@ -199,7 +199,6 @@ class PlaceholderRange:
...
@@ -199,7 +199,6 @@ class PlaceholderRange:
def
embeds_cumsum
(
self
)
->
torch
.
Tensor
|
None
:
def
embeds_cumsum
(
self
)
->
torch
.
Tensor
|
None
:
return
None
if
self
.
is_embed
is
None
else
self
.
is_embed
.
cumsum
(
dim
=
0
)
return
None
if
self
.
is_embed
is
None
else
self
.
is_embed
.
cumsum
(
dim
=
0
)
@
cached_property
def
get_num_embeds
(
self
)
->
int
:
def
get_num_embeds
(
self
)
->
int
:
if
self
.
embeds_cumsum
is
None
:
if
self
.
embeds_cumsum
is
None
:
return
self
.
length
return
self
.
length
...
...
vllm/v1/core/sched/scheduler.py
View file @
48312e57
...
@@ -1100,7 +1100,7 @@ class Scheduler(SchedulerInterface):
...
@@ -1100,7 +1100,7 @@ class Scheduler(SchedulerInterface):
for
i
,
mm_feature
in
enumerate
(
mm_features
):
for
i
,
mm_feature
in
enumerate
(
mm_features
):
start_pos
=
mm_feature
.
mm_position
.
offset
start_pos
=
mm_feature
.
mm_position
.
offset
num_encoder_tokens
=
mm_feature
.
mm_position
.
length
num_encoder_tokens
=
mm_feature
.
mm_position
.
length
num_encoder_embeds
=
mm_feature
.
mm_position
.
get_num_embeds
num_encoder_embeds
=
mm_feature
.
mm_position
.
get_num_embeds
()
item_identifier
=
mm_feature
.
identifier
item_identifier
=
mm_feature
.
identifier
# The encoder output is needed if the two ranges overlap:
# The encoder output is needed if the two ranges overlap:
...
...
vllm/v1/engine/input_processor.py
View file @
48312e57
...
@@ -786,7 +786,7 @@ class InputProcessor:
...
@@ -786,7 +786,7 @@ class InputProcessor:
decoder_mm_positions
=
prompt_inputs
[
"mm_placeholders"
]
decoder_mm_positions
=
prompt_inputs
[
"mm_placeholders"
]
for
modality
,
mm_positions
in
decoder_mm_positions
.
items
():
for
modality
,
mm_positions
in
decoder_mm_positions
.
items
():
for
mm_position
in
mm_positions
:
for
mm_position
in
mm_positions
:
embed_length
=
mm_position
.
get_num_embeds
embed_length
=
mm_position
.
get_num_embeds
()
if
embed_length
>
self
.
mm_encoder_cache_size
:
if
embed_length
>
self
.
mm_encoder_cache_size
:
raise
ValueError
(
raise
ValueError
(
f
"The
{
prompt_type
}
prompt contains a(n)
{
modality
}
item "
f
"The
{
prompt_type
}
prompt contains a(n)
{
modality
}
item "
...
...
vllm/v1/request.py
View file @
48312e57
...
@@ -260,7 +260,7 @@ class Request:
...
@@ -260,7 +260,7 @@ class Request:
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
assert
input_id
<
len
(
self
.
mm_features
)
assert
input_id
<
len
(
self
.
mm_features
)
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
()
def
record_event
(
def
record_event
(
self
,
self
,
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
48312e57
...
@@ -2326,7 +2326,7 @@ class GPUModelRunner(
...
@@ -2326,7 +2326,7 @@ class GPUModelRunner(
# Prefer pos_info.get_num_embeds to count precise MM embedding tokens.
# Prefer pos_info.get_num_embeds to count precise MM embedding tokens.
num_tokens
=
self
.
model
.
get_num_mm_encoder_tokens
(
# type: ignore[attr-defined]
num_tokens
=
self
.
model
.
get_num_mm_encoder_tokens
(
# type: ignore[attr-defined]
pos_info
.
get_num_embeds
pos_info
.
get_num_embeds
()
)
)
prompt_lora_mapping
.
append
(
lora_id
)
prompt_lora_mapping
.
append
(
lora_id
)
token_lora_mapping
.
extend
([
lora_id
]
*
num_tokens
)
token_lora_mapping
.
extend
([
lora_id
]
*
num_tokens
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment