Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
116880a5
Unverified
Commit
116880a5
authored
Feb 06, 2026
by
Cyrus Leung
Committed by
GitHub
Feb 05, 2026
Browse files
[Bugfix] Make MM batching more robust (#33817)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
4145e50d
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
625 additions
and
428 deletions
+625
-428
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+2
-1
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-1
.buildkite/test_areas/models_basic.yaml
.buildkite/test_areas/models_basic.yaml
+2
-1
tests/models/test_terratorch.py
tests/models/test_terratorch.py
+9
-3
tests/multimodal/media/test_connector.py
tests/multimodal/media/test_connector.py
+320
-0
tests/multimodal/test_hasher.py
tests/multimodal/test_hasher.py
+16
-0
tests/multimodal/test_inputs.py
tests/multimodal/test_inputs.py
+46
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+36
-350
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+18
-18
vllm/model_executor/models/terratorch.py
vllm/model_executor/models/terratorch.py
+28
-18
vllm/multimodal/hasher.py
vllm/multimodal/hasher.py
+13
-5
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+32
-20
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+101
-11
No files found.
.buildkite/test-amd.yaml
View file @
116880a5
...
@@ -863,10 +863,11 @@ steps:
...
@@ -863,10 +863,11 @@ steps:
torch_nightly
:
true
torch_nightly
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/models/test_terratorch.py
-
tests/models/test_transformers.py
-
tests/models/test_transformers.py
-
tests/models/test_registry.py
-
tests/models/test_registry.py
commands
:
commands
:
-
pytest -v -s models/test_transformers.py models/test_registry.py
-
pytest -v -s
models/test_terratorch.py
models/test_transformers.py models/test_registry.py
-
label
:
Basic Models Test (Other CPU)
# 5min
-
label
:
Basic Models Test (Other CPU)
# 5min
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
...
...
.buildkite/test-pipeline.yaml
View file @
116880a5
...
@@ -804,10 +804,11 @@ steps:
...
@@ -804,10 +804,11 @@ steps:
torch_nightly
:
true
torch_nightly
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/models/test_terratorch.py
-
tests/models/test_transformers.py
-
tests/models/test_transformers.py
-
tests/models/test_registry.py
-
tests/models/test_registry.py
commands
:
commands
:
-
pytest -v -s models/test_transformers.py models/test_registry.py
-
pytest -v -s
models/test_terratorch.py
models/test_transformers.py models/test_registry.py
-
label
:
Basic Models Test (Other CPU)
# 5min
-
label
:
Basic Models Test (Other CPU)
# 5min
timeout_in_minutes
:
10
timeout_in_minutes
:
10
...
...
.buildkite/test_areas/models_basic.yaml
View file @
116880a5
...
@@ -33,10 +33,11 @@ steps:
...
@@ -33,10 +33,11 @@ steps:
timeout_in_minutes
:
45
timeout_in_minutes
:
45
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/models/test_terratorch.py
-
tests/models/test_transformers.py
-
tests/models/test_transformers.py
-
tests/models/test_registry.py
-
tests/models/test_registry.py
commands
:
commands
:
-
pytest -v -s models/test_transformers.py models/test_registry.py
-
pytest -v -s
models/test_terratorch.py
models/test_transformers.py models/test_registry.py
-
label
:
Basic Models Test (Other CPU)
# 5min
-
label
:
Basic Models Test (Other CPU)
# 5min
depends_on
:
depends_on
:
...
...
tests/models/test_terratorch.py
View file @
116880a5
...
@@ -5,8 +5,10 @@ import pytest
...
@@ -5,8 +5,10 @@ import pytest
import
torch
import
torch
from
tests.conftest
import
VllmRunner
from
tests.conftest
import
VllmRunner
from
tests.utils
import
create_new_process_for_each_test
@
create_new_process_for_each_test
()
# Memory is not cleaned up properly otherwise
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model"
,
"model"
,
[
[
...
@@ -22,10 +24,14 @@ def test_inference(
...
@@ -22,10 +24,14 @@ def test_inference(
location_coords
=
torch
.
full
((
1
,
2
),
1.0
,
dtype
=
torch
.
float16
)
location_coords
=
torch
.
full
((
1
,
2
),
1.0
,
dtype
=
torch
.
float16
)
prompt
=
dict
(
prompt
=
dict
(
prompt_token_ids
=
[
1
],
prompt_token_ids
=
[
1
],
multi_modal_data
=
dict
(
multi_modal_data
=
{
pixel_values
=
pixel_values
,
location_coords
=
location_coords
"image"
:
{
),
"pixel_values"
:
pixel_values
,
"location_coords"
:
location_coords
,
}
},
)
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
runner
=
"pooling"
,
runner
=
"pooling"
,
...
...
tests/multimodal/media/test_connector.py
0 → 100644
View file @
116880a5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
base64
import
mimetypes
import
os
from
tempfile
import
NamedTemporaryFile
,
TemporaryDirectory
import
numpy
as
np
import
pytest
import
torch
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.media
import
MediaConnector
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS
=
[
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
TEST_VIDEO_URLS
=
[
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
,
"https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi"
,
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
url_images
(
local_asset_server
)
->
dict
[
str
,
Image
.
Image
]:
return
{
image_url
:
local_asset_server
.
get_image_asset
(
image_url
)
for
image_url
in
TEST_IMAGE_ASSETS
}
def
get_supported_suffixes
()
->
tuple
[
str
,
...]:
# We should at least test the file types mentioned in GPT-4 with Vision
OPENAI_SUPPORTED_SUFFIXES
=
(
".png"
,
".jpeg"
,
".jpg"
,
".webp"
,
".gif"
)
# Additional file types that are supported by us
EXTRA_SUPPORTED_SUFFIXES
=
(
".bmp"
,
".tiff"
)
return
OPENAI_SUPPORTED_SUFFIXES
+
EXTRA_SUPPORTED_SUFFIXES
def
_image_equals
(
a
:
Image
.
Image
,
b
:
Image
.
Image
)
->
bool
:
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
convert_image_mode
(
b
,
a
.
mode
))).
all
()
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_ASSETS
,
indirect
=
True
)
async
def
test_fetch_image_http
(
image_url
:
str
):
connector
=
MediaConnector
()
image_sync
=
connector
.
fetch_image
(
image_url
)
image_async
=
await
connector
.
fetch_image_async
(
image_url
)
assert
_image_equals
(
image_sync
,
image_async
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"raw_image_url"
,
TEST_IMAGE_ASSETS
)
@
pytest
.
mark
.
parametrize
(
"suffix"
,
get_supported_suffixes
())
async
def
test_fetch_image_base64
(
url_images
:
dict
[
str
,
Image
.
Image
],
raw_image_url
:
str
,
suffix
:
str
):
connector
=
MediaConnector
(
# Domain restriction should not apply to data URLs.
allowed_media_domains
=
[
"www.bogotobogo.com"
,
"github.com"
,
]
)
url_image
=
url_images
[
raw_image_url
]
try
:
mime_type
=
Image
.
MIME
[
Image
.
registered_extensions
()[
suffix
]]
except
KeyError
:
try
:
mime_type
=
mimetypes
.
types_map
[
suffix
]
except
KeyError
:
pytest
.
skip
(
"No MIME type"
)
with
NamedTemporaryFile
(
suffix
=
suffix
)
as
f
:
try
:
url_image
.
save
(
f
.
name
)
except
Exception
as
e
:
if
e
.
args
[
0
]
==
"cannot write mode RGBA as JPEG"
:
pytest
.
skip
(
"Conversion not supported"
)
raise
base64_image
=
base64
.
b64encode
(
f
.
read
()).
decode
(
"utf-8"
)
data_url
=
f
"data:
{
mime_type
}
;base64,
{
base64_image
}
"
data_image_sync
=
connector
.
fetch_image
(
data_url
)
if
_image_equals
(
url_image
,
Image
.
open
(
f
)):
assert
_image_equals
(
url_image
,
data_image_sync
)
else
:
pass
# Lossy format; only check that image can be opened
data_image_async
=
await
connector
.
fetch_image_async
(
data_url
)
assert
_image_equals
(
data_image_sync
,
data_image_async
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_ASSETS
,
indirect
=
True
)
async
def
test_fetch_image_local_files
(
image_url
:
str
):
connector
=
MediaConnector
()
with
TemporaryDirectory
()
as
temp_dir
:
local_connector
=
MediaConnector
(
allowed_local_media_path
=
temp_dir
)
origin_image
=
connector
.
fetch_image
(
image_url
)
origin_image
.
save
(
os
.
path
.
join
(
temp_dir
,
os
.
path
.
basename
(
image_url
)),
quality
=
100
,
icc_profile
=
origin_image
.
info
.
get
(
"icc_profile"
),
)
image_async
=
await
local_connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/
{
os
.
path
.
basename
(
image_url
)
}
"
)
image_sync
=
local_connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/
{
os
.
path
.
basename
(
image_url
)
}
"
)
# Check that the images are equal
assert
not
ImageChops
.
difference
(
image_sync
,
image_async
).
getbbox
()
with
pytest
.
raises
(
ValueError
,
match
=
"must be a subpath"
):
await
local_connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
with
pytest
.
raises
(
RuntimeError
,
match
=
"Cannot load local files"
):
await
connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
with
pytest
.
raises
(
ValueError
,
match
=
"must be a subpath"
):
local_connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
with
pytest
.
raises
(
RuntimeError
,
match
=
"Cannot load local files"
):
connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
[
TEST_IMAGE_ASSETS
[
0
]],
indirect
=
True
)
async
def
test_fetch_image_local_files_with_space_in_name
(
image_url
:
str
):
connector
=
MediaConnector
()
with
TemporaryDirectory
()
as
temp_dir
:
local_connector
=
MediaConnector
(
allowed_local_media_path
=
temp_dir
)
origin_image
=
connector
.
fetch_image
(
image_url
)
filename
=
"file name with space.jpg"
origin_image
.
save
(
os
.
path
.
join
(
temp_dir
,
filename
),
quality
=
100
,
icc_profile
=
origin_image
.
info
.
get
(
"icc_profile"
),
)
try
:
image_async
=
await
local_connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/
{
filename
}
"
)
image_sync
=
local_connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/
{
filename
}
"
)
except
FileNotFoundError
as
e
:
pytest
.
fail
(
"Failed to fetch image with space in name: {}"
.
format
(
e
))
# Check that the images are equal
assert
not
ImageChops
.
difference
(
image_sync
,
image_async
).
getbbox
()
@
pytest
.
mark
.
asyncio
async
def
test_fetch_image_error_conversion
():
connector
=
MediaConnector
()
broken_img
=
"data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
# PIL.UnidentifiedImageError should be converted to ValueError
with
pytest
.
raises
(
ValueError
):
await
connector
.
fetch_image_async
(
broken_img
)
with
pytest
.
raises
(
ValueError
):
connector
.
fetch_image
(
broken_img
)
@
pytest
.
mark
.
flaky
(
reruns
=
3
,
reruns_delay
=
5
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
async
def
test_fetch_video_http
(
video_url
:
str
,
num_frames
:
int
):
connector
=
MediaConnector
(
media_io_kwargs
=
{
"video"
:
{
"num_frames"
:
num_frames
,
}
}
)
try
:
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
except
(
TimeoutError
,
asyncio
.
TimeoutError
)
as
e
:
pytest
.
skip
(
f
"Timeout fetching video (CI network flakiness):
{
e
}
"
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"max_duration"
,
[
1
,
60
,
1800
])
@
pytest
.
mark
.
parametrize
(
"requested_fps"
,
[
2
,
24
])
async
def
test_fetch_video_http_with_dynamic_loader
(
video_url
:
str
,
max_duration
:
int
,
requested_fps
:
int
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"opencv_dynamic"
)
connector
=
MediaConnector
(
media_io_kwargs
=
{
"video"
:
{
"max_duration"
:
max_duration
,
"requested_fps"
:
requested_fps
,
}
}
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
assert
metadata_sync
[
"video_backend"
]
==
"opencv_dynamic"
@
pytest
.
mark
.
parametrize
(
"is_embed,start_idx,end_idx,expected"
,
[
(
None
,
2
,
4
,
(
2
,
4
)),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
3
,
5
,
(
1
,
3
),
),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
0
,
2
,
(
0
,
1
),
),
(
torch
.
tensor
([
True
,
False
,
True
,
False
]),
2
,
2
,
(
1
,
1
),
),
],
)
def
test_placeholder_range_get_embeds_indices_in_range
(
is_embed
,
start_idx
,
end_idx
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_embeds_indices_in_range
(
start_idx
,
end_idx
)
==
expected
@
pytest
.
mark
.
parametrize
(
"offset,is_embed,expected"
,
[
(
0
,
None
,
[(
0
,
4
)]),
(
2
,
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
[(
3
,
3
),
(
5
,
6
)],
),
(
0
,
torch
.
tensor
([
True
,
True
,
True
,
True
]),
[(
0
,
3
)]),
(
0
,
torch
.
tensor
([
False
,
False
,
False
,
False
]),
[]),
],
)
def
test_placeholder_range_extract_embeds_range
(
offset
,
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
offset
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
extract_embeds_range
()
==
expected
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
async
def
test_allowed_media_domains
(
video_url
:
str
,
num_frames
:
int
):
connector
=
MediaConnector
(
media_io_kwargs
=
{
"video"
:
{
"num_frames"
:
num_frames
,
}
},
allowed_media_domains
=
[
"www.bogotobogo.com"
,
"github.com"
,
],
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
disallowed_url
=
"https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
with
pytest
.
raises
(
ValueError
):
_
,
_
=
connector
.
fetch_video
(
disallowed_url
)
with
pytest
.
raises
(
ValueError
):
_
,
_
=
await
connector
.
fetch_video_async
(
disallowed_url
)
tests/multimodal/test_hasher.py
View file @
116880a5
...
@@ -16,6 +16,22 @@ ASSETS_DIR = Path(__file__).parent / "assets"
...
@@ -16,6 +16,22 @@ ASSETS_DIR = Path(__file__).parent / "assets"
assert
ASSETS_DIR
.
exists
()
assert
ASSETS_DIR
.
exists
()
def
test_hash_single_item_different_shape
():
x1
=
torch
.
zeros
(())
x2
=
torch
.
zeros
((
1
,))
hasher
=
MultiModalHasher
assert
hasher
.
hash_kwargs
(
x
=
x1
)
!=
hasher
.
hash_kwargs
(
x
=
x2
)
def
test_hash_key_order_invariant
():
x
=
torch
.
zeros
((
5
,
10
))
y
=
torch
.
ones
((
5
,
10
))
hasher
=
MultiModalHasher
assert
hasher
.
hash_kwargs
(
x
=
x
,
y
=
y
)
==
hasher
.
hash_kwargs
(
y
=
y
,
x
=
x
)
# NOTE: Images that are the same visually are allowed to have the same hash
# NOTE: Images that are the same visually are allowed to have the same hash
@
pytest
.
mark
.
parametrize
(
"mode_pair"
,
[(
"1"
,
"L"
),
(
"RGBA"
,
"CMYK"
)])
@
pytest
.
mark
.
parametrize
(
"mode_pair"
,
[(
"1"
,
"L"
),
(
"RGBA"
,
"CMYK"
)])
def
test_hash_collision_image_mode
(
mode_pair
):
def
test_hash_collision_image_mode
(
mode_pair
):
...
...
tests/multimodal/test_inputs.py
0 → 100644
View file @
116880a5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.multimodal.inputs
import
PlaceholderRange
@
pytest
.
mark
.
parametrize
(
"is_embed,expected"
,
[
(
None
,
5
),
(
torch
.
tensor
([
True
,
True
,
True
,
True
,
True
]),
5
),
(
torch
.
tensor
([
False
,
False
,
False
,
False
,
False
]),
0
),
(
torch
.
tensor
([
True
,
False
,
True
,
False
,
True
]),
3
),
(
torch
.
tensor
([
True
]),
1
),
],
)
def
test_placeholder_range_get_num_embeds
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_num_embeds
==
expected
@
pytest
.
mark
.
parametrize
(
"is_embed,expected"
,
[
(
None
,
None
),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
torch
.
tensor
([
0
,
1
,
1
,
2
,
3
]),
),
(
torch
.
tensor
([
True
,
True
,
True
]),
torch
.
tensor
([
1
,
2
,
3
])),
],
)
def
test_placeholder_range_embeds_cumsum
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
if
expected
is
None
:
assert
pr
.
embeds_cumsum
is
None
return
assert
torch
.
equal
(
pr
.
embeds_cumsum
,
expected
)
# cached_property should return the same object on repeated access
assert
pr
.
embeds_cumsum
is
pr
.
embeds_cumsum
tests/multimodal/test_utils.py
View file @
116880a5
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
base64
import
mimetypes
import
os
from
tempfile
import
NamedTemporaryFile
,
TemporaryDirectory
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.media
import
MediaConnector
from
vllm.multimodal.utils
import
argsort_mm_positions
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS
=
[
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png"
,
# "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
TEST_VIDEO_URLS
=
[
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
,
"https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi"
,
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
url_images
(
local_asset_server
)
->
dict
[
str
,
Image
.
Image
]:
return
{
image_url
:
local_asset_server
.
get_image_asset
(
image_url
)
for
image_url
in
TEST_IMAGE_ASSETS
}
def
get_supported_suffixes
()
->
tuple
[
str
,
...]:
# We should at least test the file types mentioned in GPT-4 with Vision
OPENAI_SUPPORTED_SUFFIXES
=
(
".png"
,
".jpeg"
,
".jpg"
,
".webp"
,
".gif"
)
# Additional file types that are supported by us
EXTRA_SUPPORTED_SUFFIXES
=
(
".bmp"
,
".tiff"
)
return
OPENAI_SUPPORTED_SUFFIXES
+
EXTRA_SUPPORTED_SUFFIXES
def
_image_equals
(
a
:
Image
.
Image
,
b
:
Image
.
Image
)
->
bool
:
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
convert_image_mode
(
b
,
a
.
mode
))).
all
()
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_ASSETS
,
indirect
=
True
)
async
def
test_fetch_image_http
(
image_url
:
str
):
connector
=
MediaConnector
()
image_sync
=
connector
.
fetch_image
(
image_url
)
image_async
=
await
connector
.
fetch_image_async
(
image_url
)
assert
_image_equals
(
image_sync
,
image_async
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"raw_image_url"
,
TEST_IMAGE_ASSETS
)
@
pytest
.
mark
.
parametrize
(
"suffix"
,
get_supported_suffixes
())
async
def
test_fetch_image_base64
(
url_images
:
dict
[
str
,
Image
.
Image
],
raw_image_url
:
str
,
suffix
:
str
):
connector
=
MediaConnector
(
# Domain restriction should not apply to data URLs.
allowed_media_domains
=
[
"www.bogotobogo.com"
,
"github.com"
,
]
)
url_image
=
url_images
[
raw_image_url
]
try
:
mime_type
=
Image
.
MIME
[
Image
.
registered_extensions
()[
suffix
]]
except
KeyError
:
try
:
mime_type
=
mimetypes
.
types_map
[
suffix
]
except
KeyError
:
pytest
.
skip
(
"No MIME type"
)
with
NamedTemporaryFile
(
suffix
=
suffix
)
as
f
:
try
:
url_image
.
save
(
f
.
name
)
except
Exception
as
e
:
if
e
.
args
[
0
]
==
"cannot write mode RGBA as JPEG"
:
pytest
.
skip
(
"Conversion not supported"
)
raise
base64_image
=
base64
.
b64encode
(
f
.
read
()).
decode
(
"utf-8"
)
data_url
=
f
"data:
{
mime_type
}
;base64,
{
base64_image
}
"
data_image_sync
=
connector
.
fetch_image
(
data_url
)
if
_image_equals
(
url_image
,
Image
.
open
(
f
)):
assert
_image_equals
(
url_image
,
data_image_sync
)
else
:
pass
# Lossy format; only check that image can be opened
data_image_async
=
await
connector
.
fetch_image_async
(
data_url
)
assert
_image_equals
(
data_image_sync
,
data_image_async
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_ASSETS
,
indirect
=
True
)
async
def
test_fetch_image_local_files
(
image_url
:
str
):
connector
=
MediaConnector
()
with
TemporaryDirectory
()
as
temp_dir
:
local_connector
=
MediaConnector
(
allowed_local_media_path
=
temp_dir
)
origin_image
=
connector
.
fetch_image
(
image_url
)
origin_image
.
save
(
os
.
path
.
join
(
temp_dir
,
os
.
path
.
basename
(
image_url
)),
quality
=
100
,
icc_profile
=
origin_image
.
info
.
get
(
"icc_profile"
),
)
image_async
=
await
local_connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/
{
os
.
path
.
basename
(
image_url
)
}
"
)
image_sync
=
local_connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/
{
os
.
path
.
basename
(
image_url
)
}
"
)
# Check that the images are equal
assert
not
ImageChops
.
difference
(
image_sync
,
image_async
).
getbbox
()
with
pytest
.
raises
(
ValueError
,
match
=
"must be a subpath"
):
await
local_connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
with
pytest
.
raises
(
RuntimeError
,
match
=
"Cannot load local files"
):
await
connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
with
pytest
.
raises
(
ValueError
,
match
=
"must be a subpath"
):
local_connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
with
pytest
.
raises
(
RuntimeError
,
match
=
"Cannot load local files"
):
connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
[
TEST_IMAGE_ASSETS
[
0
]],
indirect
=
True
)
async
def
test_fetch_image_local_files_with_space_in_name
(
image_url
:
str
):
connector
=
MediaConnector
()
with
TemporaryDirectory
()
as
temp_dir
:
local_connector
=
MediaConnector
(
allowed_local_media_path
=
temp_dir
)
origin_image
=
connector
.
fetch_image
(
image_url
)
filename
=
"file name with space.jpg"
origin_image
.
save
(
os
.
path
.
join
(
temp_dir
,
filename
),
quality
=
100
,
icc_profile
=
origin_image
.
info
.
get
(
"icc_profile"
),
)
try
:
image_async
=
await
local_connector
.
fetch_image_async
(
f
"file://
{
temp_dir
}
/
{
filename
}
"
)
image_sync
=
local_connector
.
fetch_image
(
f
"file://
{
temp_dir
}
/
{
filename
}
"
)
except
FileNotFoundError
as
e
:
pytest
.
fail
(
"Failed to fetch image with space in name: {}"
.
format
(
e
))
# Check that the images are equal
assert
not
ImageChops
.
difference
(
image_sync
,
image_async
).
getbbox
()
@
pytest
.
mark
.
asyncio
async
def
test_fetch_image_error_conversion
():
connector
=
MediaConnector
()
broken_img
=
"data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
# PIL.UnidentifiedImageError should be converted to ValueError
with
pytest
.
raises
(
ValueError
):
await
connector
.
fetch_image_async
(
broken_img
)
with
pytest
.
raises
(
ValueError
):
connector
.
fetch_image
(
broken_img
)
@
pytest
.
mark
.
flaky
(
reruns
=
3
,
reruns_delay
=
5
)
from
vllm.multimodal.inputs
import
(
@
pytest
.
mark
.
asyncio
MultiModalBatchedField
,
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
MultiModalFieldElem
,
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
MultiModalKwargsItem
,
async
def
test_fetch_video_http
(
video_url
:
str
,
num_frames
:
int
):
MultiModalSharedField
,
connector
=
MediaConnector
(
PlaceholderRange
,
media_io_kwargs
=
{
)
"video"
:
{
from
vllm.multimodal.utils
import
argsort_mm_positions
,
group_and_batch_mm_items
"num_frames"
:
num_frames
,
}
}
)
try
:
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
except
(
TimeoutError
,
asyncio
.
TimeoutError
)
as
e
:
pytest
.
skip
(
f
"Timeout fetching video (CI network flakiness):
{
e
}
"
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"max_duration"
,
[
1
,
60
,
1800
])
@
pytest
.
mark
.
parametrize
(
"requested_fps"
,
[
2
,
24
])
async
def
test_fetch_video_http_with_dynamic_loader
(
video_url
:
str
,
max_duration
:
int
,
requested_fps
:
int
,
monkeypatch
:
pytest
.
MonkeyPatch
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"opencv_dynamic"
)
connector
=
MediaConnector
(
media_io_kwargs
=
{
"video"
:
{
"max_duration"
:
max_duration
,
"requested_fps"
:
requested_fps
,
}
}
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
assert
metadata_sync
[
"video_backend"
]
==
"opencv_dynamic"
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -412,121 +184,35 @@ def test_argsort_mm_positions(case):
...
@@ -412,121 +184,35 @@ def test_argsort_mm_positions(case):
assert
modality_idxs
==
expected_modality_idxs
assert
modality_idxs
==
expected_modality_idxs
@
pytest
.
mark
.
parametrize
(
def
test_group_and_batch_mm_items_split_by_fieldset
():
"is_embed,expected"
,
elem
=
MultiModalFieldElem
(
[
data
=
torch
.
empty
(
1
,
dtype
=
torch
.
uint8
),
(
None
,
5
),
field
=
MultiModalBatchedField
(),
(
torch
.
tensor
([
True
,
True
,
True
,
True
,
True
]),
5
),
(
torch
.
tensor
([
False
,
False
,
False
,
False
,
False
]),
0
),
(
torch
.
tensor
([
True
,
False
,
True
,
False
,
True
]),
3
),
(
torch
.
tensor
([
True
]),
1
),
],
)
def
test_placeholder_range_get_num_embeds
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_num_embeds
==
expected
@
pytest
.
mark
.
parametrize
(
"is_embed,expected"
,
[
(
None
,
None
),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
torch
.
tensor
([
0
,
1
,
1
,
2
,
3
]),
),
(
torch
.
tensor
([
True
,
True
,
True
]),
torch
.
tensor
([
1
,
2
,
3
])),
],
)
def
test_placeholder_range_embeds_cumsum
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
if
expected
is
None
:
assert
pr
.
embeds_cumsum
is
None
return
assert
torch
.
equal
(
pr
.
embeds_cumsum
,
expected
)
# cached_property should return the same object on repeated access
assert
pr
.
embeds_cumsum
is
pr
.
embeds_cumsum
@
pytest
.
mark
.
parametrize
(
"is_embed,start_idx,end_idx,expected"
,
[
(
None
,
2
,
4
,
(
2
,
4
)),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
3
,
5
,
(
1
,
3
),
),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
0
,
2
,
(
0
,
1
),
),
(
torch
.
tensor
([
True
,
False
,
True
,
False
]),
2
,
2
,
(
1
,
1
),
),
],
)
def
test_placeholder_range_get_embeds_indices_in_range
(
is_embed
,
start_idx
,
end_idx
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_embeds_indices_in_range
(
start_idx
,
end_idx
)
==
expected
@
pytest
.
mark
.
parametrize
(
"offset,is_embed,expected"
,
[
(
0
,
None
,
[(
0
,
4
)]),
(
2
,
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
[(
3
,
3
),
(
5
,
6
)],
),
(
0
,
torch
.
tensor
([
True
,
True
,
True
,
True
]),
[(
0
,
3
)]),
(
0
,
torch
.
tensor
([
False
,
False
,
False
,
False
]),
[]),
],
)
def
test_placeholder_range_extract_embeds_range
(
offset
,
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
offset
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
extract_embeds_range
()
==
expected
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
async
def
test_allowed_media_domains
(
video_url
:
str
,
num_frames
:
int
):
connector
=
MediaConnector
(
media_io_kwargs
=
{
"video"
:
{
"num_frames"
:
num_frames
,
}
},
allowed_media_domains
=
[
"www.bogotobogo.com"
,
"github.com"
,
],
)
)
item1
=
MultiModalKwargsItem
({
"x"
:
elem
,
"y"
:
elem
})
item2
=
MultiModalKwargsItem
({
"y"
:
elem
,
"x"
:
elem
})
item3
=
MultiModalKwargsItem
({
"x"
:
elem
,
"y"
:
elem
,
"z"
:
elem
})
item4
=
MultiModalKwargsItem
({
"x"
:
elem
})
item5
=
MultiModalKwargsItem
({
"x"
:
elem
,
"y"
:
elem
})
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
res
=
group_and_batch_mm_items
([
item1
,
item2
,
item3
,
item4
,
item5
])
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
assert
[
num_items
for
num_items
,
_
in
res
]
==
[
2
,
1
,
1
,
1
]
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
disallowed_url
=
"https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
with
pytest
.
raises
(
ValueError
):
_
,
_
=
connector
.
fetch_video
(
disallowed_url
)
with
pytest
.
raises
(
ValueError
):
def
test_group_and_batch_mm_items_split_by_shared_data
():
_
,
_
=
await
connector
.
fetch_video_async
(
disallowed_url
)
elem1
=
MultiModalFieldElem
(
data
=
torch
.
zeros
(
1
,
dtype
=
torch
.
uint8
),
field
=
MultiModalSharedField
(
batch_size
=
1
),
)
elem2
=
MultiModalFieldElem
(
data
=
torch
.
zeros
(
2
,
dtype
=
torch
.
uint8
),
field
=
MultiModalSharedField
(
batch_size
=
1
),
)
item1
=
MultiModalKwargsItem
({
"x"
:
elem1
})
item2
=
MultiModalKwargsItem
({
"x"
:
elem1
})
item3
=
MultiModalKwargsItem
({
"x"
:
elem2
})
item4
=
MultiModalKwargsItem
({
"x"
:
elem1
})
item5
=
MultiModalKwargsItem
({
"x"
:
elem2
})
res
=
group_and_batch_mm_items
([
item1
,
item2
,
item3
,
item4
,
item5
])
assert
[
num_items
for
num_items
,
_
in
res
]
==
[
2
,
1
,
1
,
1
]
vllm/model_executor/models/step3_vl.py
View file @
116880a5
...
@@ -71,9 +71,7 @@ class Step3VLImagePixelInputs(TensorSchema):
...
@@ -71,9 +71,7 @@ class Step3VLImagePixelInputs(TensorSchema):
type
:
Literal
[
"pixel_values"
]
type
:
Literal
[
"pixel_values"
]
pixel_values
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
,
3
,
"h"
,
"w"
)]
pixel_values
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
,
3
,
"h"
,
"w"
)]
patch_pixel_values
:
Annotated
[
patch_pixel_values
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bnp"
,
3
,
"hp"
,
"wp"
)]
torch
.
Tensor
|
None
,
TensorShape
(
"bnp"
,
3
,
"hp"
,
"wp"
)
]
num_patches
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
)]
num_patches
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
)]
...
@@ -91,7 +89,7 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
...
@@ -91,7 +89,7 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
Step3VLImageInputs
:
TypeAlias
=
Step3VLImagePixelInputs
|
Step3VLImageEmbeddingInputs
Step3VLImageInputs
:
TypeAlias
=
Step3VLImagePixelInputs
|
Step3VLImageEmbeddingInputs
ImageWithPatches
=
tuple
[
Image
.
Image
,
list
[
Image
.
Image
],
list
[
int
]
|
None
]
ImageWithPatches
=
tuple
[
Image
.
Image
,
list
[
Image
.
Image
],
list
[
bool
]
|
None
]
MAX_IMAGE_SIZE
:
int
=
3024
MAX_IMAGE_SIZE
:
int
=
3024
...
@@ -432,7 +430,7 @@ class Step3VLProcessor:
...
@@ -432,7 +430,7 @@ class Step3VLProcessor:
if
len
(
parts
)
-
1
!=
len
(
repls
):
if
len
(
parts
)
-
1
!=
len
(
repls
):
raise
ValueError
(
raise
ValueError
(
"The number of placeholders does not match the number of replacements."
# noqa: E501
"The number of placeholders does not match the number of replacements."
)
)
result
=
[
parts
[
0
]]
result
=
[
parts
[
0
]]
...
@@ -468,7 +466,7 @@ class Step3VLProcessor:
...
@@ -468,7 +466,7 @@ class Step3VLProcessor:
image_repl_str_lst
=
[]
image_repl_str_lst
=
[]
image_repl_ids_lst
=
[]
image_repl_ids_lst
=
[]
num_patches
=
[]
num_patches
=
[]
for
raw_img
,
img_patches
,
patch_newline_mask
in
splitted_images_data
:
# noqa: E501
for
raw_img
,
img_patches
,
patch_newline_mask
in
splitted_images_data
:
pixel_values_lst
.
extend
(
self
.
_convert_images_to_pixel_values
([
raw_img
]))
pixel_values_lst
.
extend
(
self
.
_convert_images_to_pixel_values
([
raw_img
]))
if
len
(
img_patches
)
>
0
:
if
len
(
img_patches
)
>
0
:
...
@@ -486,16 +484,20 @@ class Step3VLProcessor:
...
@@ -486,16 +484,20 @@ class Step3VLProcessor:
if
patch_newline_mask
is
not
None
:
if
patch_newline_mask
is
not
None
:
patch_newline_mask_lst
.
extend
(
patch_newline_mask
)
patch_newline_mask_lst
.
extend
(
patch_newline_mask
)
pixel_values
=
torch
.
cat
(
pixel_values_lst
)
patch_size
=
self
.
patch_size
image_inputs
=
{
image_inputs
=
{
"pixel_values"
:
torch
.
cat
(
pixel_values
_lst
)
,
"pixel_values"
:
pixel_values
,
"num_patches"
:
num_patches
,
"num_patches"
:
num_patches
,
}
"patch_pixel_values"
:
(
if
patch_pixel_values_lst
:
torch
.
cat
(
patch_pixel_values_lst
)
image_inputs
[
"patch_pixel_values"
]
=
torch
.
cat
(
patch_pixel_values_lst
)
if
patch_pixel_values_lst
if
patch_newline_mask_lst
:
else
pixel_values
.
new_empty
((
0
,
3
,
patch_size
,
patch_size
))
image_inputs
[
"patch_newline_mask"
]
=
torch
.
tensor
(
),
"patch_newline_mask"
:
torch
.
tensor
(
patch_newline_mask_lst
,
dtype
=
torch
.
bool
patch_newline_mask_lst
,
dtype
=
torch
.
bool
)
),
}
text
=
[
text
=
[
self
.
replace_placeholder
(
t
,
self
.
image_token
,
image_repl_str_lst
)
self
.
replace_placeholder
(
t
,
self
.
image_token
,
image_repl_str_lst
)
...
@@ -998,13 +1000,11 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -998,13 +1000,11 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
if
pixel_values
is
None
and
image_embeds
is
None
:
if
pixel_values
is
None
and
image_embeds
is
None
:
return
None
return
None
if
pixel_values
is
not
None
:
if
pixel_values
is
not
None
and
patch_pixel_values
is
not
None
:
return
Step3VLImagePixelInputs
(
return
Step3VLImagePixelInputs
(
type
=
"pixel_values"
,
type
=
"pixel_values"
,
pixel_values
=
pixel_values
.
to
(
self
.
dtype
),
pixel_values
=
pixel_values
.
to
(
self
.
dtype
),
patch_pixel_values
=
patch_pixel_values
.
to
(
self
.
dtype
)
patch_pixel_values
=
patch_pixel_values
.
to
(
self
.
dtype
),
if
patch_pixel_values
is
not
None
else
None
,
num_patches
=
num_patches
,
num_patches
=
num_patches
,
)
)
...
@@ -1039,7 +1039,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -1039,7 +1039,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
image_features
=
self
.
_get_vision_model_output
(
image_input
[
"pixel_values"
])
image_features
=
self
.
_get_vision_model_output
(
image_input
[
"pixel_values"
])
patch_image_features
=
(
patch_image_features
=
(
self
.
_get_vision_model_output
(
image_input
[
"patch_pixel_values"
])
self
.
_get_vision_model_output
(
image_input
[
"patch_pixel_values"
])
if
image_input
[
"patch_pixel_values"
]
is
not
None
if
len
(
image_input
[
"patch_pixel_values"
]
)
>
0
else
None
else
None
)
)
num_patches
=
image_input
[
"num_patches"
]
num_patches
=
image_input
[
"num_patches"
]
...
...
vllm/model_executor/models/terratorch.py
View file @
116880a5
...
@@ -62,7 +62,6 @@ from vllm.multimodal.processing import (
...
@@ -62,7 +62,6 @@ from vllm.multimodal.processing import (
PromptUpdate
,
PromptUpdate
,
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
length_from_prompt_token_ids_or_embeds
from
.interfaces
import
IsAttentionFree
,
MultiModalEmbeddings
,
SupportsMultiModal
from
.interfaces
import
IsAttentionFree
,
MultiModalEmbeddings
,
SupportsMultiModal
from
.interfaces_base
import
attn_type
from
.interfaces_base
import
attn_type
...
@@ -74,7 +73,11 @@ def _terratorch_field_names(input_definition: InputDefinition):
...
@@ -74,7 +73,11 @@ def _terratorch_field_names(input_definition: InputDefinition):
return
set
(
input_definition
.
data
.
keys
())
return
set
(
input_definition
.
data
.
keys
())
def
_terratorch_field_factory
(
input_definition
:
InputDefinition
):
def
_terratorch_field_factory
(
input_definition
:
InputDefinition
,
*
,
is_shared
:
bool
=
True
,
# True for unprocessed data, False for processed data
):
def
_terratorch_field_config
(
def
_terratorch_field_config
(
hf_inputs
:
Mapping
[
str
,
torch
.
Tensor
],
hf_inputs
:
Mapping
[
str
,
torch
.
Tensor
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
...
@@ -82,7 +85,11 @@ def _terratorch_field_factory(input_definition: InputDefinition):
...
@@ -82,7 +85,11 @@ def _terratorch_field_factory(input_definition: InputDefinition):
for
name
,
input
in
input_definition
.
data
.
items
():
for
name
,
input
in
input_definition
.
data
.
items
():
modality
=
"image"
modality
=
"image"
if
input
.
type
==
InputTypeEnum
.
tensor
:
if
input
.
type
==
InputTypeEnum
.
tensor
:
fields
[
name
]
=
MultiModalFieldConfig
.
shared
(
modality
,
batch_size
=
1
)
fields
[
name
]
=
(
MultiModalFieldConfig
.
shared
(
modality
,
batch_size
=
1
)
if
is_shared
else
MultiModalFieldConfig
.
batched
(
modality
)
)
return
fields
return
fields
...
@@ -166,8 +173,14 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
...
@@ -166,8 +173,14 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
self
,
self
,
hf_inputs
:
BatchFeature
,
hf_inputs
:
BatchFeature
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
*
,
is_shared
:
bool
=
True
,
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
return
_terratorch_field_factory
(
self
.
info
.
input_definition
)(
hf_inputs
)
factory
=
_terratorch_field_factory
(
self
.
info
.
input_definition
,
is_shared
=
is_shared
,
)
return
factory
(
hf_inputs
)
def
_get_prompt_updates
(
def
_get_prompt_updates
(
self
,
self
,
...
@@ -193,12 +206,19 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
...
@@ -193,12 +206,19 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
)
)
_
,
passthrough_data
=
self
.
_get_hf_mm_data
(
mm_items
)
_
,
passthrough_data
=
self
.
_get_hf_mm_data
(
mm_items
)
mm_processed_data
=
BatchFeature
(
dict
(
passthrough_data
),
tensor_type
=
"pt"
)
mm_processed_data
=
BatchFeature
(
{
k
:
torch
.
tensor
(
v
).
unsqueeze
(
0
)
for
k
,
v
in
passthrough_data
.
items
()},
tensor_type
=
"pt"
,
)
mm_placeholders
=
{
"image"
:
[
PlaceholderRange
(
offset
=
0
,
length
=
0
)]}
mm_placeholders
=
{
"image"
:
[
PlaceholderRange
(
offset
=
0
,
length
=
0
)]}
mm_kwargs
=
MultiModalKwargsItems
.
from_hf_inputs
(
mm_kwargs
=
MultiModalKwargsItems
.
from_hf_inputs
(
mm_processed_data
,
mm_processed_data
,
self
.
_get_mm_fields_config
(
mm_processed_data
,
hf_processor_mm_kwargs
),
self
.
_get_mm_fields_config
(
mm_processed_data
,
hf_processor_mm_kwargs
,
is_shared
=
False
,
),
)
)
return
MultiModalInputs
(
return
MultiModalInputs
(
...
@@ -235,9 +255,6 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
...
@@ -235,9 +255,6 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
self
.
inference_runner
=
InferenceRunner
(
config
)
self
.
inference_runner
=
InferenceRunner
(
config
)
self
.
model
=
self
.
inference_runner
.
model
self
.
model
=
self
.
inference_runner
.
model
pooler_config
=
vllm_config
.
model_config
.
pooler_config
assert
pooler_config
is
not
None
self
.
pooler
=
IdentityPooler
()
self
.
pooler
=
IdentityPooler
()
def
embed_input_ids
(
def
embed_input_ids
(
...
@@ -262,15 +279,8 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
...
@@ -262,15 +279,8 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
):
):
input_len
=
length_from_prompt_token_ids_or_embeds
(
input_ids
,
inputs_embeds
)
model_output
=
self
.
inference_runner
.
forward
(
**
kwargs
)
return
model_output
.
output
batched_kwargs
=
{
k
:
v
.
unsqueeze
(
0
)
for
k
,
v
in
kwargs
.
items
()}
model_output
=
self
.
inference_runner
.
forward
(
**
batched_kwargs
).
output
# The leading dimension of hidden states needs to equal input length
return
model_output
.
expand
(
input_len
,
*
(
-
1
for
_
in
range
(
model_output
.
ndim
-
1
))
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
params_list
=
[]
params_list
=
[]
...
...
vllm/multimodal/hasher.py
View file @
116880a5
...
@@ -102,12 +102,19 @@ class MultiModalHasher:
...
@@ -102,12 +102,19 @@ class MultiModalHasher:
"data"
:
tensor_obj
.
numpy
(),
"data"
:
tensor_obj
.
numpy
(),
},
},
)
)
return
cls
.
iter_item_to_bytes
(
"tensor"
,
tensor_obj
.
numpy
())
return
cls
.
iter_item_to_bytes
(
"tensor"
,
tensor_obj
.
numpy
())
if
isinstance
(
obj
,
np
.
ndarray
):
if
isinstance
(
obj
,
np
.
ndarray
):
# If the array is non-contiguous, we need to copy it first
if
obj
.
ndim
==
0
:
arr_data
=
(
arr_data
=
obj
.
item
()
obj
.
view
(
np
.
uint8
).
data
if
obj
.
flags
.
c_contiguous
else
obj
.
tobytes
()
elif
obj
.
flags
.
c_contiguous
:
)
# Not valid for 0-D arrays
arr_data
=
obj
.
view
(
np
.
uint8
).
data
else
:
# If the array is non-contiguous, we need to copy it first
arr_data
=
obj
.
tobytes
()
return
cls
.
iter_item_to_bytes
(
return
cls
.
iter_item_to_bytes
(
"ndarray"
,
"ndarray"
,
{
{
...
@@ -116,6 +123,7 @@ class MultiModalHasher:
...
@@ -116,6 +123,7 @@ class MultiModalHasher:
"data"
:
arr_data
,
"data"
:
arr_data
,
},
},
)
)
logger
.
warning
(
logger
.
warning
(
"No serialization method found for %s. Falling back to pickle."
,
type
(
obj
)
"No serialization method found for %s. Falling back to pickle."
,
type
(
obj
)
)
)
...
@@ -147,7 +155,7 @@ class MultiModalHasher:
...
@@ -147,7 +155,7 @@ class MultiModalHasher:
hasher_factory
=
_get_hasher_factory
(
envs
.
VLLM_MM_HASHER_ALGORITHM
)
hasher_factory
=
_get_hasher_factory
(
envs
.
VLLM_MM_HASHER_ALGORITHM
)
hasher
=
hasher_factory
()
hasher
=
hasher_factory
()
for
k
,
v
in
kwargs
.
items
():
for
k
,
v
in
sorted
(
kwargs
.
items
()
,
key
=
lambda
kv
:
kv
[
0
])
:
for
bytes_
in
cls
.
iter_item_to_bytes
(
k
,
v
):
for
bytes_
in
cls
.
iter_item_to_bytes
(
k
,
v
):
hasher
.
update
(
bytes_
)
hasher
.
update
(
bytes_
)
...
...
vllm/multimodal/inputs.py
View file @
116880a5
...
@@ -424,8 +424,9 @@ class BaseMultiModalField(ABC):
...
@@ -424,8 +424,9 @@ class BaseMultiModalField(ABC):
keep_on_cpu
:
bool
=
False
keep_on_cpu
:
bool
=
False
"""
"""
If `True`, then this field is excluded from being moved to the accelerator
If `True`, then this field is excluded from being moved to the accelerator when
when `MultiModalKwargsItems.get_data()` is called to batch the data.
[`group_and_batch_mm_items`][vllm.multimodal.utils.group_and_batch_mm_items]
is called to batch the data.
"""
"""
def
_field_factory
(
self
):
def
_field_factory
(
self
):
...
@@ -1006,27 +1007,38 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
...
@@ -1006,27 +1007,38 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
pin_memory
:
bool
=
False
,
pin_memory
:
bool
=
False
,
)
->
BatchedTensorInputs
:
)
->
BatchedTensorInputs
:
"""Construct a dictionary of keyword arguments to pass to the model."""
"""Construct a dictionary of keyword arguments to pass to the model."""
elems_by_key
=
defaultdict
[
str
,
list
[
MultiModalFieldElem
]](
list
)
from
.utils
import
group_and_batch_mm_items
for
modality
,
items
in
self
.
items
():
for
i
,
item
in
enumerate
(
items
):
items_by_modality
=
self
.
require_data
()
if
item
is
None
:
batches_by_modality
=
{
raise
RuntimeError
(
modality
:
[
f
"Cannot build data from empty mm_items[
{
modality
}
][
{
i
}
]"
data
)
for
_
,
data
in
group_and_batch_mm_items
(
items
,
device
=
device
,
pin_memory
=
pin_memory
,
)
]
for
modality
,
items
in
items_by_modality
.
items
()
if
len
(
items
)
>
0
}
for
key
,
elem
in
item
.
items
():
out_data
:
BatchedTensorInputs
=
{}
elems_by_key
[
key
].
append
(
elem
)
for
_
,
batches
in
batches_by_modality
.
items
():
if
len
(
batches
)
!=
1
:
num_batches_by_modality
=
{
modality
:
len
(
batches
)
for
modality
,
batches
in
batches_by_modality
.
items
()
}
raise
RuntimeError
(
f
"Some modalities cannot be merged into a single batch "
f
"(
{
num_batches_by_modality
=
}
)"
)
data
=
{
out_data
.
update
(
batches
[
0
])
key
:
elems
[
0
].
field
.
reduce_data
(
elems
,
device
=
device
,
pin_memory
=
pin_memory
,
)
for
key
,
elems
in
elems_by_key
.
items
()
}
return
data
return
out_
data
MultiModalKwargsOptionalItems
:
TypeAlias
=
(
MultiModalKwargsOptionalItems
:
TypeAlias
=
(
...
...
vllm/multimodal/utils.py
View file @
116880a5
...
@@ -3,7 +3,8 @@
...
@@ -3,7 +3,8 @@
import
mimetypes
import
mimetypes
import
warnings
import
warnings
from
collections.abc
import
Generator
from
collections
import
defaultdict
from
collections.abc
import
Generator
,
Sequence
from
itertools
import
groupby
from
itertools
import
groupby
from
typing
import
TYPE_CHECKING
,
Any
from
typing
import
TYPE_CHECKING
,
Any
...
@@ -13,11 +14,13 @@ from PIL import Image
...
@@ -13,11 +14,13 @@ from PIL import Image
from
vllm.utils.import_utils
import
LazyLoader
from
vllm.utils.import_utils
import
LazyLoader
from
.hasher
import
MultiModalHasher
from
.inputs
import
(
from
.inputs
import
(
BatchedTensorInputs
,
BatchedTensorInputs
,
MultiModalFieldElem
,
MultiModalKwargsItem
,
MultiModalKwargsItem
,
MultiModalKwargsItems
,
MultiModalPlaceholderDict
,
MultiModalPlaceholderDict
,
MultiModalSharedField
,
)
)
from
.media
import
AudioMediaIO
,
ImageMediaIO
,
MediaConnector
,
VideoMediaIO
from
.media
import
AudioMediaIO
,
ImageMediaIO
,
MediaConnector
,
VideoMediaIO
...
@@ -146,32 +149,119 @@ def argsort_mm_positions(
...
@@ -146,32 +149,119 @@ def argsort_mm_positions(
return
[(
modality
,
idx
)
for
modality
,
idx
,
_
in
sorted_flat_items
]
return
[(
modality
,
idx
)
for
modality
,
idx
,
_
in
sorted_flat_items
]
def
_get_group_hash
(
elem
:
MultiModalFieldElem
):
if
not
isinstance
(
elem
.
field
,
MultiModalSharedField
):
return
None
return
MultiModalHasher
.
hash_kwargs
(
data
=
elem
.
data
)
def
_batch_mm_items
(
items
:
Sequence
[
MultiModalKwargsItem
],
*
,
device
:
torch
.
types
.
Device
=
None
,
pin_memory
:
bool
=
False
,
):
elems
=
defaultdict
[
str
,
list
[
MultiModalFieldElem
]](
list
)
for
item
in
items
:
for
key
,
elem
in
item
.
items
():
elems
[
key
].
append
(
elem
)
return
{
key
:
elems
[
0
].
field
.
reduce_data
(
elems
,
device
=
device
,
pin_memory
=
pin_memory
,
)
for
key
,
elems
in
elems
.
items
()
}
def
group_and_batch_mm_items
(
items
:
Sequence
[
MultiModalKwargsItem
],
*
,
device
:
torch
.
types
.
Device
=
None
,
pin_memory
:
bool
=
False
,
)
->
Generator
[
tuple
[
int
,
BatchedTensorInputs
]]:
"""
Group consecutive items (possibly from different requests) into batches.
Items must be split across groups if any of the following occurs,
as the batch would otherwise be invalid:
- They have different fields (e.g. mixed image and embedding inputs).
- They have different values in `MultiModalSharedField`.
Args:
items: List of `MultiModalKwargsItem`.
device: The device to place the grouped tensors on.
pin_memory: Whether to pin memory for faster host-to-device transfer.
Yields:
A tuple `(num_items, grouped_kwargs)`, where:
- `kwargs` is a dictionary of keyword arguments to pass to the model;
- `num_items` is the corresponding number of items.
"""
group_ids
=
[
tuple
(
(
key
,
_get_group_hash
(
elem
))
for
key
,
elem
in
sorted
(
item
.
items
(),
key
=
lambda
kv
:
kv
[
0
])
)
for
item
in
items
]
group_sizes
=
[
sum
(
1
for
_
in
group
)
for
_
,
group
in
groupby
(
group_ids
)]
start_idx
=
0
for
group_size
in
group_sizes
:
group_data
=
_batch_mm_items
(
items
[
start_idx
:
start_idx
+
group_size
],
device
=
device
,
pin_memory
=
pin_memory
,
)
yield
group_size
,
group_data
start_idx
+=
group_size
assert
start_idx
==
len
(
items
)
def
group_mm_kwargs_by_modality
(
def
group_mm_kwargs_by_modality
(
mm_kwargs
:
list
[
tuple
[
str
,
MultiModalKwargsItem
]],
mm_kwargs
:
list
[
tuple
[
str
,
MultiModalKwargsItem
]],
*
,
*
,
device
:
torch
.
types
.
Device
=
None
,
device
:
torch
.
types
.
Device
=
None
,
pin_memory
:
bool
=
False
,
pin_memory
:
bool
=
False
,
)
->
Generator
[
tuple
[
str
,
int
,
BatchedTensorInputs
],
None
,
None
]:
)
->
Generator
[
tuple
[
str
,
int
,
BatchedTensorInputs
],
None
,
None
]:
"""Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
"""
modality together into the same `MultiModalKwargs` instance.
Group consecutive items (possibly from different requests) into batches.
Items must be split across groups if any of the following occurs,
as the batch would otherwise be invalid:
- They have different fields (e.g. mixed image and embedding inputs).
- They have different values in `MultiModalSharedField`.
To simplify the implementation of `embed_multimodal`, we add another
restriction that the items in a batch must belong to the same modality.
Args:
Args:
mm_kwargs: List of `
MultiModalKwargsI
tem`.
mm_kwargs: List of `
(modality, i
tem
)
`.
device: The device to place the grouped tensors on.
device: The device to place the grouped tensors on.
pin_memory: Whether to pin memory for faster host-to-device transfer.
pin_memory: Whether to pin memory for faster host-to-device transfer.
Yields:
Yields:
A tuple `(modality, num_items, grouped_kwargs)`.
A tuple `(modality, num_items, grouped_kwargs)`, where:
- `modality` is the modality of the batch;
- `kwargs` is a dictionary of keyword arguments to pass to the model;
- `num_items` is the corresponding number of items.
"""
"""
for
modality
,
group
in
groupby
(
mm_kwargs
,
key
=
lambda
x
:
x
[
0
]):
for
modality
,
group
in
groupby
(
mm_kwargs
,
key
=
lambda
x
:
x
[
0
]):
items_lst
=
[
item
for
_
,
item
in
group
]
items_lst
=
[
item
for
_
,
item
in
group
]
mm_kwargs_items
=
MultiModalKwargsItems
({
modality
:
items_lst
})
mm_kwargs_data
=
mm_kwargs_items
.
get_data
(
for
num_items
,
mm_kwargs_batch
in
group_and_batch_mm_items
(
items_lst
,
device
=
device
,
device
=
device
,
pin_memory
=
pin_memory
,
pin_memory
=
pin_memory
,
)
):
yield
modality
,
num_items
,
mm_kwargs_batch
yield
modality
,
len
(
items_lst
),
mm_kwargs_data
def
fetch_audio
(
def
fetch_audio
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment