Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
28459785
Unverified
Commit
28459785
authored
Jan 15, 2026
by
Cyrus Leung
Committed by
GitHub
Jan 15, 2026
Browse files
[3/N] Group together media-related code (#32406)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
8853a50a
Changes
25
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
611 additions
and
639 deletions
+611
-639
tests/conftest.py
tests/conftest.py
+1
-1
tests/entrypoints/openai/test_sparse_tensor_validation.py
tests/entrypoints/openai/test_sparse_tensor_validation.py
+1
-2
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+1
-1
tests/entrypoints/pooling/embed/test_online_vision.py
tests/entrypoints/pooling/embed/test_online_vision.py
+1
-1
tests/multimodal/media/__init__.py
tests/multimodal/media/__init__.py
+0
-0
tests/multimodal/media/test_audio.py
tests/multimodal/media/test_audio.py
+73
-0
tests/multimodal/media/test_base.py
tests/multimodal/media/test_base.py
+45
-0
tests/multimodal/media/test_image.py
tests/multimodal/media/test_image.py
+133
-0
tests/multimodal/media/test_video.py
tests/multimodal/media/test_video.py
+237
-0
tests/multimodal/test_audio.py
tests/multimodal/test_audio.py
+0
-56
tests/multimodal/test_image.py
tests/multimodal/test_image.py
+1
-153
tests/multimodal/test_video.py
tests/multimodal/test_video.py
+3
-220
tools/pre_commit/check_pickle_imports.py
tools/pre_commit/check_pickle_imports.py
+1
-1
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+0
-77
vllm/multimodal/hasher.py
vllm/multimodal/hasher.py
+1
-1
vllm/multimodal/image.py
vllm/multimodal/image.py
+0
-119
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+1
-1
vllm/multimodal/media/__init__.py
vllm/multimodal/media/__init__.py
+16
-0
vllm/multimodal/media/audio.py
vllm/multimodal/media/audio.py
+89
-0
vllm/multimodal/media/base.py
vllm/multimodal/media/base.py
+7
-6
No files found.
tests/conftest.py
View file @
28459785
...
@@ -63,7 +63,7 @@ from vllm.distributed import (
...
@@ -63,7 +63,7 @@ from vllm.distributed import (
)
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.logprobs
import
Logprob
from
vllm.multimodal.
base
import
MediaWithBytes
from
vllm.multimodal.
media
import
MediaWithBytes
from
vllm.multimodal.utils
import
fetch_image
from
vllm.multimodal.utils
import
fetch_image
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sampling_params
import
BeamSearchParams
...
...
tests/entrypoints/openai/test_sparse_tensor_validation.py
View file @
28459785
...
@@ -14,8 +14,7 @@ import pytest
...
@@ -14,8 +14,7 @@ import pytest
import
torch
import
torch
from
vllm.entrypoints.renderer
import
CompletionRenderer
from
vllm.entrypoints.renderer
import
CompletionRenderer
from
vllm.multimodal.audio
import
AudioEmbeddingMediaIO
from
vllm.multimodal.media
import
AudioEmbeddingMediaIO
,
ImageEmbeddingMediaIO
from
vllm.multimodal.image
import
ImageEmbeddingMediaIO
def
_encode_tensor
(
tensor
:
torch
.
Tensor
)
->
bytes
:
def
_encode_tensor
(
tensor
:
torch
.
Tensor
)
->
bytes
:
...
...
tests/entrypoints/openai/test_vision.py
View file @
28459785
...
@@ -8,7 +8,7 @@ import pytest
...
@@ -8,7 +8,7 @@ import pytest
import
pytest_asyncio
import
pytest_asyncio
from
transformers
import
AutoProcessor
from
transformers
import
AutoProcessor
from
vllm.multimodal.
base
import
MediaWithBytes
from
vllm.multimodal.
media
import
MediaWithBytes
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
...
tests/entrypoints/pooling/embed/test_online_vision.py
View file @
28459785
...
@@ -9,7 +9,7 @@ from transformers import AutoProcessor
...
@@ -9,7 +9,7 @@ from transformers import AutoProcessor
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingResponse
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingResponse
from
vllm.multimodal.
base
import
MediaWithBytes
from
vllm.multimodal.
media
import
MediaWithBytes
from
vllm.multimodal.utils
import
fetch_image
from
vllm.multimodal.utils
import
fetch_image
MODEL_NAME
=
"TIGER-Lab/VLM2Vec-Full"
MODEL_NAME
=
"TIGER-Lab/VLM2Vec-Full"
...
...
tests/multimodal/media/__init__.py
0 → 100644
View file @
28459785
tests/multimodal/media/test_audio.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
from
pathlib
import
Path
from
unittest.mock
import
patch
import
numpy
as
np
import
pytest
from
vllm.multimodal.media
import
AudioMediaIO
# Apply the ``cpu_test`` marker to every test collected from this module.
pytestmark = pytest.mark.cpu_test

# ``assets`` directory located in the parent of this file's directory.
ASSETS_DIR = Path(__file__).parent.parent / "assets"
# Evaluated at import time: the module fails to import if the directory
# is missing.
assert ASSETS_DIR.exists()
@pytest.fixture
def dummy_audio():
    """Provide a five-sample 1-D float array used as a stand-in waveform."""
    samples = [0.0, 0.1, 0.2, 0.3, 0.4]
    return np.array(samples, dtype=float)
@pytest.fixture
def dummy_audio_bytes():
    """Provide a fixed byte string used as a placeholder audio payload.

    The bytes are not a real audio encoding; the tests that consume this
    fixture patch the decoding call.
    """
    return b"FAKEAUDIOBYTES"
def test_audio_media_io_load_bytes(dummy_audio_bytes):
    """Check load_bytes with ``librosa.load`` patched out.

    The patched loader must be called exactly once and its
    ``(array, sample_rate)`` pair must come back from ``load_bytes``.
    """
    audio_io = AudioMediaIO()
    stub_result = (np.array([0.1, 0.2]), 16000)

    with patch("librosa.load", return_value=stub_result) as mock_load:
        out = audio_io.load_bytes(dummy_audio_bytes)

    mock_load.assert_called_once()
    assert isinstance(out[0], np.ndarray)
    assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
    """Check load_base64 with ``AudioMediaIO.load_bytes`` patched out.

    Only the call count of the patched ``load_bytes`` and the returned
    ``(array, sample_rate)`` pair are verified.
    """
    audio_io = AudioMediaIO()
    payload = base64.b64encode(dummy_audio_bytes).decode("utf-8")
    stub_result = (np.array([0.1, 0.2]), 16000)

    with patch.object(
        AudioMediaIO, "load_bytes", return_value=stub_result
    ) as mock_load_bytes:
        out = audio_io.load_base64("audio/wav", payload)

    mock_load_bytes.assert_called_once()
    assert isinstance(out[0], np.ndarray)
    assert out[1] == 16000
def test_audio_media_io_load_file():
    """Check load_file with ``librosa.load`` patched out.

    The patched loader must receive the path together with ``sr=None``, and
    its ``(array, sample_rate)`` pair must come back from ``load_file``.
    """
    audio_io = AudioMediaIO()
    wav_path = Path("/fake/path.wav")  # never opened: librosa.load is patched
    stub_result = (np.array([0.1, 0.2]), 16000)

    with patch("librosa.load", return_value=stub_result) as mock_load:
        out = audio_io.load_file(wav_path)

    mock_load.assert_called_once_with(wav_path, sr=None)
    assert isinstance(out[0], np.ndarray)
    assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):
    """Check encode_base64 with ``soundfile.write`` patched out.

    The patched writer puts a known byte string into the buffer it is
    given; decoding the base64 result must yield exactly those bytes.
    """

    def fake_write(buffer, *_args, **_kwargs):
        # Stand-in for soundfile.write: emit a fixed payload into the buffer.
        buffer.write(b"dummy_wav_data")

    audio_io = AudioMediaIO()
    media = (dummy_audio, 16000)

    with patch("soundfile.write", side_effect=fake_write) as mock_write:
        out = audio_io.encode_base64(media)

    assert base64.b64decode(out) == b"dummy_wav_data"
    mock_write.assert_called_once()
tests/multimodal/media/test_base.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pickle
from
pathlib
import
Path
import
pytest
from
PIL
import
Image
from
vllm.multimodal.media
import
MediaWithBytes
# Apply the ``cpu_test`` marker to every test collected from this module.
pytestmark = pytest.mark.cpu_test

# ``assets`` directory located in the parent of this file's directory.
ASSETS_DIR = Path(__file__).parent.parent / "assets"
# Evaluated at import time: the module fails to import if the directory
# is missing.
assert ASSETS_DIR.exists()
def test_media_with_bytes_pickle_roundtrip():
    """Regression test for pickle/unpickle of MediaWithBytes.

    Verifies that MediaWithBytes can be pickled and unpickled without
    RecursionError. See: https://github.com/vllm-project/vllm/issues/30818
    """
    # Attributes read through the wrapper but owned by the wrapped image.
    delegated_attrs = ("width", "height", "mode")

    original_bytes = b"test_bytes_data"
    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
    wrapper = MediaWithBytes(media=original_image, original_bytes=original_bytes)

    # Attribute delegation must work before pickling.
    for attr in delegated_attrs:
        assert getattr(wrapper, attr) == getattr(original_image, attr)

    # Round-trip through pickle (this raised RecursionError before the fix).
    unpickled = pickle.loads(pickle.dumps(wrapper))

    # The restored wrapper keeps both the raw bytes and the wrapped media.
    assert unpickled.original_bytes == original_bytes
    restored_media = unpickled.media
    assert restored_media.width == original_image.width
    assert restored_media.height == original_image.height

    # Attribute delegation must still work after unpickling.
    for attr in delegated_attrs:
        assert getattr(unpickled, attr) == getattr(original_image, attr)
tests/multimodal/media/test_image.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
pathlib
import
Path
import
numpy
as
np
import
pytest
from
PIL
import
Image
from
vllm.multimodal.media
import
ImageMediaIO
# Apply the ``cpu_test`` marker to every test collected from this module.
pytestmark = pytest.mark.cpu_test

# ``assets`` directory located in the parent of this file's directory.
ASSETS_DIR = Path(__file__).parent.parent / "assets"
# Evaluated at import time: the module fails to import if the directory
# is missing.
assert ASSETS_DIR.exists()
def test_image_media_io_rgba_custom_background(tmp_path):
    """Test RGBA to RGB conversion with custom background colors."""

    def rgb_at(pixels, row, col):
        # Read the first three channels of one pixel as plain ints.
        return tuple(int(pixels[row][col][channel]) for channel in range(3))

    red = (255, 0, 0)

    # Opaque red 10x10 canvas ...
    rgba_image = Image.new("RGBA", (10, 10), (255, 0, 0, 255))
    # ... whose top-left 5x5 quadrant is made fully transparent.
    for x in range(5):
        for y in range(5):
            rgba_image.putpixel((x, y), (0, 0, 0, 0))

    # Persist the image so it can be loaded both from a path and from bytes.
    test_image_path = tmp_path / "test_rgba.png"
    rgba_image.save(test_image_path)

    # Test 1: Default white background (backward compatibility)
    default_numpy = np.array(ImageMediaIO().load_file(test_image_path))
    assert rgb_at(default_numpy, 0, 0) == (255, 255, 255)  # transparent -> white
    assert rgb_at(default_numpy, 5, 5) == red  # opaque pixels remain red

    # Test 2: Custom black background via kwargs
    black_io = ImageMediaIO(rgba_background_color=(0, 0, 0))
    black_numpy = np.array(black_io.load_file(test_image_path))
    assert rgb_at(black_numpy, 0, 0) == (0, 0, 0)  # transparent -> black
    assert rgb_at(black_numpy, 5, 5) == red  # opaque pixels remain red

    # Test 3: Custom blue background via kwargs (as list)
    blue_io = ImageMediaIO(rgba_background_color=[0, 0, 255])
    blue_numpy = np.array(blue_io.load_file(test_image_path))
    assert rgb_at(blue_numpy, 0, 0) == (0, 0, 255)  # transparent -> blue

    # Test 4: Test with load_bytes method
    image_data = test_image_path.read_bytes()
    green_io = ImageMediaIO(rgba_background_color=(0, 255, 0))
    green_numpy = np.array(green_io.load_bytes(image_data))
    assert rgb_at(green_numpy, 0, 0) == (0, 255, 0)  # transparent -> green
def test_image_media_io_rgba_background_color_validation():
    """Test that invalid rgba_background_color values are properly rejected."""
    expected_message = "rgba_background_color must be a list or tuple"

    rejected_values = [
        # Invalid types
        "255,255,255",
        255,
        # Wrong number of elements
        (255, 255),
        (255, 255, 255, 255),
        # Non-integer values
        (255.0, 255.0, 255.0),
        (255, "255", 255),
        # Out of range values
        (256, 255, 255),
        (255, -1, 255),
    ]
    for bad_value in rejected_values:
        with pytest.raises(ValueError, match=expected_message):
            ImageMediaIO(rgba_background_color=bad_value)

    # Valid values (tuple or list of three ints) must construct without raising.
    accepted_values = [(0, 0, 0), [255, 255, 255], (128, 128, 128)]
    for good_value in accepted_values:
        ImageMediaIO(rgba_background_color=good_value)
tests/multimodal/media/test_video.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
pathlib
import
Path
import
numpy
as
np
import
numpy.typing
as
npt
import
pytest
from
PIL
import
Image
from
vllm.assets.base
import
get_vllm_public_assets
from
vllm.assets.video
import
video_to_ndarrays
,
video_to_pil_images_list
from
vllm.multimodal.media
import
ImageMediaIO
,
VideoMediaIO
from
vllm.multimodal.video
import
VIDEO_LOADER_REGISTRY
,
VideoLoader
from
..utils
import
cosine_similarity
,
create_video_from_image
,
normalize_image
# Apply the ``cpu_test`` marker to every test collected from this module.
pytestmark = pytest.mark.cpu_test

# ``assets`` directory located in the parent of this file's directory.
ASSETS_DIR = Path(__file__).parent.parent / "assets"
# Evaluated at import time: the module fails to import if the directory
# is missing.
assert ASSETS_DIR.exists()
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    """Test loader registered under the name ``assert_10_frames_1_fps``.

    It does not decode ``data``: it only asserts on the ``num_frames`` and
    ``fps`` arguments it receives and returns the module-level
    ``FAKE_OUTPUT_2`` value.
    """

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, fps: float = -1.0, **kwargs
    ) -> npt.NDArray:
        # num_frames is checked before fps, so a call where both are wrong
        # fails with the "bad num_frames" message.
        assert num_frames == 10, "bad num_frames"
        assert fps == 1.0, "bad fps"
        return FAKE_OUTPUT_2
def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
    """Check that VideoMediaIO kwargs reach the selected video loader.

    The ``assert_10_frames_1_fps`` backend asserts on the ``num_frames`` and
    ``fps`` it receives, so each kwargs set either loads cleanly or fails
    with the matching assertion message.
    """
    # Kwargs sets that satisfy the loader's assertions.
    passing_kwargs = [
        {"num_frames": 10, "fps": 1.0},
        {"num_frames": 10, "fps": 1.0, "not_used": "not_used"},
    ]
    # Kwargs sets paired with the assertion message they must trigger.
    failing_kwargs = [
        ({}, "bad num_frames"),
        ({"num_frames": 9, "fps": 1.0}, "bad num_frames"),
        ({"num_frames": 10, "fps": 2.0}, "bad fps"),
    ]

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
        imageio = ImageMediaIO()

        # Verify that different args pass/fail assertions as expected.
        for io_kwargs in passing_kwargs:
            videoio = VideoMediaIO(imageio, **io_kwargs)
            _ = videoio.load_bytes(b"test")

        for io_kwargs, message in failing_kwargs:
            with pytest.raises(AssertionError, match=message):
                videoio = VideoMediaIO(imageio, **io_kwargs)
                _ = videoio.load_bytes(b"test")
@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(tmp_path, is_color: bool, fourcc: str, ext: str):
    """
    Test all functions that use OpenCV for video I/O return RGB format.
    Both RGB and grayscale videos are tested.
    """

    def assert_frames_match_reference(decoded_frames, reference):
        # Every decoded frame must be near-identical to the reference image,
        # with at most a negligible fraction of NaN similarity entries.
        for frame in decoded_frames:
            sim = cosine_similarity(
                normalize_image(np.array(frame)),
                normalize_image(np.array(reference)),
            )
            assert np.sum(np.isnan(sim)) / sim.size < 0.001
            assert np.nanmean(sim) > 0.99

    image_path = get_vllm_public_assets(
        filename="stop_sign.jpg", s3_prefix="vision_model_images"
    )
    image = Image.open(image_path)
    if not is_color:
        # Write a grayscale copy to use as the video source frame.
        image_path = f"{tmp_path}/test_grayscale_image.png"
        image = image.convert("L")
        image.save(image_path)
        # Convert to gray RGB for comparison
        image = image.convert("RGB")

    video_path = f"{tmp_path}/test_RGB_video.{ext}"
    create_video_from_image(
        image_path,
        video_path,
        num_frames=2,
        is_color=is_color,
        fourcc=fourcc,
    )

    # Path 1: frames decoded as ndarrays.
    assert_frames_match_reference(video_to_ndarrays(video_path), image)

    # Path 2: frames decoded as PIL images.
    assert_frames_match_reference(video_to_pil_images_list(video_path), image)

    # Path 3: frames decoded through VideoMediaIO.
    io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
    assert_frames_match_reference(io_frames, image)
# Size of the first axis of the fake arrays below.
NUM_FRAMES = 10
# Two independently drawn random arrays of shape (NUM_FRAMES, 1280, 720, 3).
# The test loaders in this module return them in place of decoded video, so
# a test can tell which loader produced its result.
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Stub loader that always yields FAKE_OUTPUT_1, used to check which
    backend was selected."""

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        # The input is ignored; the metadata names this backend so callers
        # can identify which loader produced the result.
        metadata = {"video_backend": "test_video_backend_override_1"}
        return FAKE_OUTPUT_1, metadata
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Stub loader that always yields FAKE_OUTPUT_2, used to check which
    backend was selected."""

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        # The input is ignored; the metadata names this backend so callers
        # can identify which loader produced the result.
        metadata = {"video_backend": "test_video_backend_override_2"}
        return FAKE_OUTPUT_2, metadata
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
    environment variable.

    This allows users to dynamically select a different video backend
    via --media-io-kwargs without changing the global env var, which is
    useful when plugins set a default backend but a specific request
    needs a different one.
    """
    with monkeypatch.context() as m:
        # Set the env var to one backend
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")
        imageio = ImageMediaIO()

        # Without video_backend kwarg, should use env var backend
        default_io = VideoMediaIO(imageio, num_frames=10)
        default_frames, default_meta = default_io.load_bytes(b"test")
        np.testing.assert_array_equal(default_frames, FAKE_OUTPUT_1)
        assert default_meta["video_backend"] == "test_video_backend_override_1"

        # With video_backend kwarg, should override env var
        override_io = VideoMediaIO(
            imageio,
            num_frames=10,
            video_backend="test_video_backend_override_2",
        )
        override_frames, override_meta = override_io.load_bytes(b"test")
        np.testing.assert_array_equal(override_frames, FAKE_OUTPUT_2)
        assert override_meta["video_backend"] == "test_video_backend_override_2"
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
    through to the underlying video loader's load_bytes method.

    This ensures the kwarg is properly popped from kwargs before forwarding.
    """
    backend_name = "test_reject_video_backend_kwarg"

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Test loader that fails if video_backend is passed through."""

        @classmethod
        def load_bytes(
            cls, data: bytes, num_frames: int = -1, **kwargs
        ) -> tuple[npt.NDArray, dict]:
            # This should never receive video_backend in kwargs
            if "video_backend" in kwargs:
                raise AssertionError(
                    "video_backend should be consumed by VideoMediaIO, "
                    "not passed to loader"
                )
            # Report the forwarded kwarg names so the test can inspect them.
            forwarded = list(kwargs)
            return FAKE_OUTPUT_1, {"received_kwargs": forwarded}

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", backend_name)
        imageio = ImageMediaIO()

        # Even when video_backend is provided, it should NOT be passed to loader
        io_kwargs = {
            "num_frames": 10,
            "video_backend": backend_name,
            "other_kwarg": "should_pass_through",
        }
        videoio = VideoMediaIO(imageio, **io_kwargs)

        # This should NOT raise AssertionError
        frames, metadata = videoio.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)

        # Verify other kwargs are still passed through
        assert "other_kwarg" in metadata["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    Test that when video_backend kwarg is None or not provided,
    VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
    """
    env_backend = "test_video_backend_override_2"

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        imageio = ImageMediaIO()

        # First an explicit None, then the kwarg omitted entirely: both must
        # resolve to the backend named by the env var.
        for backend_kwargs in ({"video_backend": None}, {}):
            videoio = VideoMediaIO(imageio, num_frames=10, **backend_kwargs)
            frames, metadata = videoio.load_bytes(b"test")
            np.testing.assert_array_equal(frames, FAKE_OUTPUT_2)
            assert metadata["video_backend"] == env_backend
tests/multimodal/test_audio.py
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# test_audio.py
# test_audio.py
import
base64
from
pathlib
import
Path
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
numpy
as
np
import
numpy
as
np
...
@@ -12,7 +10,6 @@ import torch
...
@@ -12,7 +10,6 @@ import torch
from
vllm.multimodal.audio
import
(
from
vllm.multimodal.audio
import
(
MONO_AUDIO_SPEC
,
MONO_AUDIO_SPEC
,
PASSTHROUGH_AUDIO_SPEC
,
PASSTHROUGH_AUDIO_SPEC
,
AudioMediaIO
,
AudioResampler
,
AudioResampler
,
AudioSpec
,
AudioSpec
,
ChannelReduction
,
ChannelReduction
,
...
@@ -92,59 +89,6 @@ def test_audio_resampler_no_target_sr(dummy_audio):
...
@@ -92,59 +89,6 @@ def test_audio_resampler_no_target_sr(dummy_audio):
resampler
.
resample
(
dummy_audio
,
orig_sr
=
44100
)
resampler
.
resample
(
dummy_audio
,
orig_sr
=
44100
)
@
pytest
.
fixture
def
dummy_audio_bytes
():
return
b
"FAKEAUDIOBYTES"
def
test_audio_media_io_load_bytes
(
dummy_audio_bytes
):
audio_io
=
AudioMediaIO
()
with
patch
(
"vllm.multimodal.audio.librosa.load"
)
as
mock_load
:
mock_load
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
out
=
audio_io
.
load_bytes
(
dummy_audio_bytes
)
mock_load
.
assert_called_once
()
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_load_base64
(
dummy_audio_bytes
):
audio_io
=
AudioMediaIO
()
encoded
=
base64
.
b64encode
(
dummy_audio_bytes
).
decode
(
"utf-8"
)
with
patch
.
object
(
AudioMediaIO
,
"load_bytes"
)
as
mock_load_bytes
:
mock_load_bytes
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
out
=
audio_io
.
load_base64
(
"audio/wav"
,
encoded
)
mock_load_bytes
.
assert_called_once
()
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_load_file
():
audio_io
=
AudioMediaIO
()
path
=
Path
(
"/fake/path.wav"
)
with
patch
(
"vllm.multimodal.audio.librosa.load"
)
as
mock_load
:
mock_load
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
out
=
audio_io
.
load_file
(
path
)
mock_load
.
assert_called_once_with
(
path
,
sr
=
None
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_encode_base64
(
dummy_audio
):
audio_io
=
AudioMediaIO
()
media
=
(
dummy_audio
,
16000
)
with
patch
(
"vllm.multimodal.audio.soundfile.write"
)
as
mock_write
:
def
write_to_buffer
(
buffer
,
*
_args
,
**
_kwargs
):
buffer
.
write
(
b
"dummy_wav_data"
)
mock_write
.
side_effect
=
write_to_buffer
out
=
audio_io
.
encode_base64
(
media
)
decoded
=
base64
.
b64decode
(
out
)
assert
decoded
==
b
"dummy_wav_data"
mock_write
.
assert_called_once
()
# ============================================================
# ============================================================
# Tests for normalize_audio function
# Tests for normalize_audio function
# ============================================================
# ============================================================
...
...
tests/multimodal/test_image.py
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pickle
from
pathlib
import
Path
from
pathlib
import
Path
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
from
PIL
import
Image
,
ImageChops
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.base
import
MediaWithBytes
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.image
import
ImageMediaIO
,
convert_image_mode
pytestmark
=
pytest
.
mark
.
cpu_test
pytestmark
=
pytest
.
mark
.
cpu_test
...
@@ -40,153 +38,3 @@ def test_rgba_to_rgb():
...
@@ -40,153 +38,3 @@ def test_rgba_to_rgb():
assert
converted_image_numpy
[
i
][
j
][
0
]
==
255
assert
converted_image_numpy
[
i
][
j
][
0
]
==
255
assert
converted_image_numpy
[
i
][
j
][
1
]
==
255
assert
converted_image_numpy
[
i
][
j
][
1
]
==
255
assert
converted_image_numpy
[
i
][
j
][
2
]
==
255
assert
converted_image_numpy
[
i
][
j
][
2
]
==
255
def
test_rgba_to_rgb_custom_background
(
tmp_path
):
"""Test RGBA to RGB conversion with custom background colors."""
# Create a simple RGBA image with transparent and opaque pixels
rgba_image
=
Image
.
new
(
"RGBA"
,
(
10
,
10
),
(
255
,
0
,
0
,
255
))
# Red with full opacity
# Make top-left quadrant transparent
for
i
in
range
(
5
):
for
j
in
range
(
5
):
rgba_image
.
putpixel
((
i
,
j
),
(
0
,
0
,
0
,
0
))
# Fully transparent
# Save the test image to tmp_path
test_image_path
=
tmp_path
/
"test_rgba.png"
rgba_image
.
save
(
test_image_path
)
# Test 1: Default white background (backward compatibility)
image_io_default
=
ImageMediaIO
()
converted_default
=
image_io_default
.
load_file
(
test_image_path
)
default_numpy
=
np
.
array
(
converted_default
)
# Check transparent pixels are white
assert
default_numpy
[
0
][
0
][
0
]
==
255
# R
assert
default_numpy
[
0
][
0
][
1
]
==
255
# G
assert
default_numpy
[
0
][
0
][
2
]
==
255
# B
# Check opaque pixels remain red
assert
default_numpy
[
5
][
5
][
0
]
==
255
# R
assert
default_numpy
[
5
][
5
][
1
]
==
0
# G
assert
default_numpy
[
5
][
5
][
2
]
==
0
# B
# Test 2: Custom black background via kwargs
image_io_black
=
ImageMediaIO
(
rgba_background_color
=
(
0
,
0
,
0
))
converted_black
=
image_io_black
.
load_file
(
test_image_path
)
black_numpy
=
np
.
array
(
converted_black
)
# Check transparent pixels are black
assert
black_numpy
[
0
][
0
][
0
]
==
0
# R
assert
black_numpy
[
0
][
0
][
1
]
==
0
# G
assert
black_numpy
[
0
][
0
][
2
]
==
0
# B
# Check opaque pixels remain red
assert
black_numpy
[
5
][
5
][
0
]
==
255
# R
assert
black_numpy
[
5
][
5
][
1
]
==
0
# G
assert
black_numpy
[
5
][
5
][
2
]
==
0
# B
# Test 3: Custom blue background via kwargs (as list)
image_io_blue
=
ImageMediaIO
(
rgba_background_color
=
[
0
,
0
,
255
])
converted_blue
=
image_io_blue
.
load_file
(
test_image_path
)
blue_numpy
=
np
.
array
(
converted_blue
)
# Check transparent pixels are blue
assert
blue_numpy
[
0
][
0
][
0
]
==
0
# R
assert
blue_numpy
[
0
][
0
][
1
]
==
0
# G
assert
blue_numpy
[
0
][
0
][
2
]
==
255
# B
# Test 4: Test with load_bytes method
with
open
(
test_image_path
,
"rb"
)
as
f
:
image_data
=
f
.
read
()
image_io_green
=
ImageMediaIO
(
rgba_background_color
=
(
0
,
255
,
0
))
converted_green
=
image_io_green
.
load_bytes
(
image_data
)
green_numpy
=
np
.
array
(
converted_green
)
# Check transparent pixels are green
assert
green_numpy
[
0
][
0
][
0
]
==
0
# R
assert
green_numpy
[
0
][
0
][
1
]
==
255
# G
assert
green_numpy
[
0
][
0
][
2
]
==
0
# B
def
test_rgba_background_color_validation
():
"""Test that invalid rgba_background_color values are properly rejected."""
# Test invalid types
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
"255,255,255"
)
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
255
)
# Test wrong number of elements
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
(
255
,
255
))
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
(
255
,
255
,
255
,
255
))
# Test non-integer values
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
(
255.0
,
255.0
,
255.0
))
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
(
255
,
"255"
,
255
))
# Test out of range values
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
(
256
,
255
,
255
))
with
pytest
.
raises
(
ValueError
,
match
=
"rgba_background_color must be a list or tuple"
):
ImageMediaIO
(
rgba_background_color
=
(
255
,
-
1
,
255
))
# Test that valid values work
ImageMediaIO
(
rgba_background_color
=
(
0
,
0
,
0
))
# Should not raise
ImageMediaIO
(
rgba_background_color
=
[
255
,
255
,
255
])
# Should not raise
ImageMediaIO
(
rgba_background_color
=
(
128
,
128
,
128
))
# Should not raise
def
test_media_with_bytes_pickle_roundtrip
():
"""Regression test for pickle/unpickle of MediaWithBytes.
Verifies that MediaWithBytes can be pickled and unpickled without
RecursionError. See: https://github.com/vllm-project/vllm/issues/30818
"""
original_image
=
Image
.
open
(
ASSETS_DIR
/
"image1.png"
).
convert
(
"RGB"
)
original_bytes
=
b
"test_bytes_data"
wrapper
=
MediaWithBytes
(
media
=
original_image
,
original_bytes
=
original_bytes
)
# Verify attribute delegation works before pickling
assert
wrapper
.
width
==
original_image
.
width
assert
wrapper
.
height
==
original_image
.
height
assert
wrapper
.
mode
==
original_image
.
mode
# Pickle and unpickle (this would cause RecursionError before the fix)
pickled
=
pickle
.
dumps
(
wrapper
)
unpickled
=
pickle
.
loads
(
pickled
)
# Verify the unpickled object works correctly
assert
unpickled
.
original_bytes
==
original_bytes
assert
unpickled
.
media
.
width
==
original_image
.
width
assert
unpickled
.
media
.
height
==
original_image
.
height
# Verify attribute delegation works after unpickling
assert
unpickled
.
width
==
original_image
.
width
assert
unpickled
.
height
==
original_image
.
height
assert
unpickled
.
mode
==
original_image
.
mode
tests/multimodal/test_video.py
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
tempfile
from
pathlib
import
Path
from
pathlib
import
Path
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
import
pytest
import
pytest
from
PIL
import
Image
from
vllm.assets.base
import
get_vllm_public_assets
from
vllm.multimodal.video
import
VIDEO_LOADER_REGISTRY
,
VideoLoader
from
vllm.assets.video
import
video_to_ndarrays
,
video_to_pil_images_list
from
vllm.multimodal.image
import
ImageMediaIO
from
vllm.multimodal.video
import
VIDEO_LOADER_REGISTRY
,
VideoLoader
,
VideoMediaIO
from
.utils
import
cosine_similarity
,
create_video_from_image
,
normalize_image
pytestmark
=
pytest
.
mark
.
cpu_test
pytestmark
=
pytest
.
mark
.
cpu_test
ASSETS_DIR
=
Path
(
__file__
).
parent
/
"assets"
ASSETS_DIR
=
Path
(
__file__
).
parent
/
"assets"
assert
ASSETS_DIR
.
exists
()
NUM_FRAMES
=
10
NUM_FRAMES
=
10
FAKE_OUTPUT_1
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
FAKE_OUTPUT_1
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
FAKE_OUTPUT_2
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
FAKE_OUTPUT_2
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
...
@@ -53,96 +48,6 @@ def test_video_loader_type_doesnt_exist():
...
@@ -53,96 +48,6 @@ def test_video_loader_type_doesnt_exist():
VIDEO_LOADER_REGISTRY
.
load
(
"non_existing_video_loader"
)
VIDEO_LOADER_REGISTRY
.
load
(
"non_existing_video_loader"
)
@
VIDEO_LOADER_REGISTRY
.
register
(
"assert_10_frames_1_fps"
)
class
Assert10Frames1FPSVideoLoader
(
VideoLoader
):
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
fps
:
float
=
-
1.0
,
**
kwargs
)
->
npt
.
NDArray
:
assert
num_frames
==
10
,
"bad num_frames"
assert
fps
==
1.0
,
"bad fps"
return
FAKE_OUTPUT_2
def
test_video_media_io_kwargs
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"assert_10_frames_1_fps"
)
imageio
=
ImageMediaIO
()
# Verify that different args pass/fail assertions as expected.
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
10
,
"fps"
:
1.0
})
_
=
videoio
.
load_bytes
(
b
"test"
)
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
10
,
"fps"
:
1.0
,
"not_used"
:
"not_used"
}
)
_
=
videoio
.
load_bytes
(
b
"test"
)
with
pytest
.
raises
(
AssertionError
,
match
=
"bad num_frames"
):
videoio
=
VideoMediaIO
(
imageio
,
**
{})
_
=
videoio
.
load_bytes
(
b
"test"
)
with
pytest
.
raises
(
AssertionError
,
match
=
"bad num_frames"
):
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
9
,
"fps"
:
1.0
})
_
=
videoio
.
load_bytes
(
b
"test"
)
with
pytest
.
raises
(
AssertionError
,
match
=
"bad fps"
):
videoio
=
VideoMediaIO
(
imageio
,
**
{
"num_frames"
:
10
,
"fps"
:
2.0
})
_
=
videoio
.
load_bytes
(
b
"test"
)
@
pytest
.
mark
.
parametrize
(
"is_color"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"fourcc, ext"
,
[(
"mp4v"
,
"mp4"
),
(
"XVID"
,
"avi"
)])
def
test_opencv_video_io_colorspace
(
is_color
:
bool
,
fourcc
:
str
,
ext
:
str
):
"""
Test all functions that use OpenCV for video I/O return RGB format.
Both RGB and grayscale videos are tested.
"""
image_path
=
get_vllm_public_assets
(
filename
=
"stop_sign.jpg"
,
s3_prefix
=
"vision_model_images"
)
image
=
Image
.
open
(
image_path
)
with
tempfile
.
TemporaryDirectory
()
as
tmpdir
:
if
not
is_color
:
image_path
=
f
"
{
tmpdir
}
/test_grayscale_image.png"
image
=
image
.
convert
(
"L"
)
image
.
save
(
image_path
)
# Convert to gray RGB for comparison
image
=
image
.
convert
(
"RGB"
)
video_path
=
f
"
{
tmpdir
}
/test_RGB_video.
{
ext
}
"
create_video_from_image
(
image_path
,
video_path
,
num_frames
=
2
,
is_color
=
is_color
,
fourcc
=
fourcc
,
)
frames
=
video_to_ndarrays
(
video_path
)
for
frame
in
frames
:
sim
=
cosine_similarity
(
normalize_image
(
np
.
array
(
frame
)),
normalize_image
(
np
.
array
(
image
))
)
assert
np
.
sum
(
np
.
isnan
(
sim
))
/
sim
.
size
<
0.001
assert
np
.
nanmean
(
sim
)
>
0.99
pil_frames
=
video_to_pil_images_list
(
video_path
)
for
frame
in
pil_frames
:
sim
=
cosine_similarity
(
normalize_image
(
np
.
array
(
frame
)),
normalize_image
(
np
.
array
(
image
))
)
assert
np
.
sum
(
np
.
isnan
(
sim
))
/
sim
.
size
<
0.001
assert
np
.
nanmean
(
sim
)
>
0.99
io_frames
,
_
=
VideoMediaIO
(
ImageMediaIO
()).
load_file
(
Path
(
video_path
))
for
frame
in
io_frames
:
sim
=
cosine_similarity
(
normalize_image
(
np
.
array
(
frame
)),
normalize_image
(
np
.
array
(
image
))
)
assert
np
.
sum
(
np
.
isnan
(
sim
))
/
sim
.
size
<
0.001
assert
np
.
nanmean
(
sim
)
>
0.99
def
test_video_backend_handles_broken_frames
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_video_backend_handles_broken_frames
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
"""
Regression test for handling videos with broken frames.
Regression test for handling videos with broken frames.
...
@@ -179,128 +84,6 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
...
@@ -179,128 +84,6 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
)
)
@
VIDEO_LOADER_REGISTRY
.
register
(
"test_video_backend_override_1"
)
class
TestVideoBackendOverride1
(
VideoLoader
):
"""Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
)
->
tuple
[
npt
.
NDArray
,
dict
]:
return
FAKE_OUTPUT_1
,
{
"video_backend"
:
"test_video_backend_override_1"
}
@
VIDEO_LOADER_REGISTRY
.
register
(
"test_video_backend_override_2"
)
class
TestVideoBackendOverride2
(
VideoLoader
):
"""Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
)
->
tuple
[
npt
.
NDArray
,
dict
]:
return
FAKE_OUTPUT_2
,
{
"video_backend"
:
"test_video_backend_override_2"
}
def
test_video_media_io_backend_kwarg_override
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
environment variable.
This allows users to dynamically select a different video backend
via --media-io-kwargs without changing the global env var, which is
useful when plugins set a default backend but a specific request
needs a different one.
"""
with
monkeypatch
.
context
()
as
m
:
# Set the env var to one backend
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"test_video_backend_override_1"
)
imageio
=
ImageMediaIO
()
# Without video_backend kwarg, should use env var backend
videoio_default
=
VideoMediaIO
(
imageio
,
num_frames
=
10
)
frames_default
,
metadata_default
=
videoio_default
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_default
,
FAKE_OUTPUT_1
)
assert
metadata_default
[
"video_backend"
]
==
"test_video_backend_override_1"
# With video_backend kwarg, should override env var
videoio_override
=
VideoMediaIO
(
imageio
,
num_frames
=
10
,
video_backend
=
"test_video_backend_override_2"
)
frames_override
,
metadata_override
=
videoio_override
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_override
,
FAKE_OUTPUT_2
)
assert
metadata_override
[
"video_backend"
]
==
"test_video_backend_override_2"
def
test_video_media_io_backend_kwarg_not_passed_to_loader
(
monkeypatch
:
pytest
.
MonkeyPatch
,
):
"""
Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
through to the underlying video loader's load_bytes method.
This ensures the kwarg is properly popped from kwargs before forwarding.
"""
@
VIDEO_LOADER_REGISTRY
.
register
(
"test_reject_video_backend_kwarg"
)
class
RejectVideoBackendKwargLoader
(
VideoLoader
):
"""Test loader that fails if video_backend is passed through."""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
)
->
tuple
[
npt
.
NDArray
,
dict
]:
# This should never receive video_backend in kwargs
if
"video_backend"
in
kwargs
:
raise
AssertionError
(
"video_backend should be consumed by VideoMediaIO, "
"not passed to loader"
)
return
FAKE_OUTPUT_1
,
{
"received_kwargs"
:
list
(
kwargs
.
keys
())}
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"test_reject_video_backend_kwarg"
)
imageio
=
ImageMediaIO
()
# Even when video_backend is provided, it should NOT be passed to loader
videoio
=
VideoMediaIO
(
imageio
,
num_frames
=
10
,
video_backend
=
"test_reject_video_backend_kwarg"
,
other_kwarg
=
"should_pass_through"
,
)
# This should NOT raise AssertionError
frames
,
metadata
=
videoio
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames
,
FAKE_OUTPUT_1
)
# Verify other kwargs are still passed through
assert
"other_kwarg"
in
metadata
[
"received_kwargs"
]
def
test_video_media_io_backend_env_var_fallback
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Test that when video_backend kwarg is None or not provided,
VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"test_video_backend_override_2"
)
imageio
=
ImageMediaIO
()
# Explicit None should fall back to env var
videoio_none
=
VideoMediaIO
(
imageio
,
num_frames
=
10
,
video_backend
=
None
)
frames_none
,
metadata_none
=
videoio_none
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_none
,
FAKE_OUTPUT_2
)
assert
metadata_none
[
"video_backend"
]
==
"test_video_backend_override_2"
# Not providing video_backend should also fall back to env var
videoio_missing
=
VideoMediaIO
(
imageio
,
num_frames
=
10
)
frames_missing
,
metadata_missing
=
videoio_missing
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_missing
,
FAKE_OUTPUT_2
)
assert
metadata_missing
[
"video_backend"
]
==
"test_video_backend_override_2"
# ============================================================================
# ============================================================================
# Frame Recovery Tests
# Frame Recovery Tests
# ============================================================================
# ============================================================================
...
...
tools/pre_commit/check_pickle_imports.py
View file @
28459785
...
@@ -27,7 +27,7 @@ ALLOWED_FILES = {
...
@@ -27,7 +27,7 @@ ALLOWED_FILES = {
"vllm/distributed/device_communicators/shm_broadcast.py"
,
"vllm/distributed/device_communicators/shm_broadcast.py"
,
"vllm/distributed/device_communicators/shm_object_storage.py"
,
"vllm/distributed/device_communicators/shm_object_storage.py"
,
"vllm/utils/hashing.py"
,
"vllm/utils/hashing.py"
,
"tests/multimodal/test_
imag
e.py"
,
"tests/multimodal/
media/
test_
bas
e.py"
,
"tests/tokenizers_/test_hf.py"
,
"tests/tokenizers_/test_hf.py"
,
"tests/utils_/test_hashing.py"
,
"tests/utils_/test_hashing.py"
,
"benchmarks/kernels/graph_machete_bench.py"
,
"benchmarks/kernels/graph_machete_bench.py"
,
...
...
vllm/multimodal/audio.py
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
enum
import
Enum
from
enum
import
Enum
from
io
import
BytesIO
from
pathlib
import
Path
from
typing
import
Literal
from
typing
import
Literal
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
import
pybase64
import
torch
import
torch
from
vllm.utils.import_utils
import
PlaceholderModule
from
vllm.utils.import_utils
import
PlaceholderModule
from
vllm.utils.serial_utils
import
tensor2base64
from
.base
import
MediaIO
try
:
try
:
import
librosa
import
librosa
except
ImportError
:
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
try
:
import
soundfile
except
ImportError
:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
try
:
try
:
import
scipy.signal
as
scipy_signal
import
scipy.signal
as
scipy_signal
...
@@ -220,68 +208,3 @@ class AudioResampler:
...
@@ -220,68 +208,3 @@ class AudioResampler:
f
"Invalid resampling method:
{
self
.
method
}
. "
f
"Invalid resampling method:
{
self
.
method
}
. "
"Supported methods are 'librosa' and 'scipy'."
"Supported methods are 'librosa' and 'scipy'."
)
)
class
AudioMediaIO
(
MediaIO
[
tuple
[
npt
.
NDArray
,
float
]]):
def
__init__
(
self
,
**
kwargs
)
->
None
:
super
().
__init__
()
# `kwargs` contains custom arguments from
# --media-io-kwargs for this modality.
# They can be passed to the underlying
# media loaders (e.g. custom implementations)
# for flexible control.
self
.
kwargs
=
kwargs
def
load_bytes
(
self
,
data
:
bytes
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
librosa
.
load
(
BytesIO
(
data
),
sr
=
None
)
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
self
.
load_bytes
(
base64
.
b64decode
(
data
))
def
load_file
(
self
,
filepath
:
Path
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
librosa
.
load
(
filepath
,
sr
=
None
)
def
encode_base64
(
self
,
media
:
tuple
[
npt
.
NDArray
,
int
],
*
,
audio_format
:
str
=
"WAV"
,
)
->
str
:
audio
,
sr
=
media
with
BytesIO
()
as
buffer
:
soundfile
.
write
(
buffer
,
audio
,
sr
,
format
=
audio_format
)
data
=
buffer
.
getvalue
()
return
base64
.
b64encode
(
data
).
decode
(
"utf-8"
)
class
AudioEmbeddingMediaIO
(
MediaIO
[
torch
.
Tensor
]):
def
__init__
(
self
)
->
None
:
super
().
__init__
()
def
load_bytes
(
self
,
data
:
bytes
)
->
torch
.
Tensor
:
buffer
=
BytesIO
(
data
)
# Enable sparse tensor integrity checks to prevent out-of-bounds
# writes from maliciously crafted tensors
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
load
(
buffer
,
weights_only
=
True
)
return
tensor
.
to_dense
()
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
)
->
torch
.
Tensor
:
return
self
.
load_bytes
(
pybase64
.
b64decode
(
data
,
validate
=
True
))
def
load_file
(
self
,
filepath
:
Path
)
->
torch
.
Tensor
:
# Enable sparse tensor integrity checks to prevent out-of-bounds
# writes from maliciously crafted tensors
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
load
(
filepath
,
weights_only
=
True
)
return
tensor
.
to_dense
()
def
encode_base64
(
self
,
media
:
torch
.
Tensor
)
->
str
:
return
tensor2base64
(
media
)
vllm/multimodal/hasher.py
View file @
28459785
...
@@ -12,7 +12,7 @@ from PIL import Image
...
@@ -12,7 +12,7 @@ from PIL import Image
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
.
base
import
MediaWithBytes
from
.
media
import
MediaWithBytes
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
...
vllm/multimodal/image.py
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
io
import
BytesIO
from
pathlib
import
Path
import
pybase64
import
torch
from
PIL
import
Image
from
PIL
import
Image
from
vllm.logger
import
init_logger
from
.base
import
MediaIO
,
MediaWithBytes
logger
=
init_logger
(
__file__
)
def
rescale_image_size
(
def
rescale_image_size
(
image
:
Image
.
Image
,
size_factor
:
float
,
transpose
:
int
=
-
1
image
:
Image
.
Image
,
size_factor
:
float
,
transpose
:
int
=
-
1
...
@@ -45,111 +34,3 @@ def convert_image_mode(image: Image.Image, to_mode: str):
...
@@ -45,111 +34,3 @@ def convert_image_mode(image: Image.Image, to_mode: str):
return
rgba_to_rgb
(
image
)
return
rgba_to_rgb
(
image
)
else
:
else
:
return
image
.
convert
(
to_mode
)
return
image
.
convert
(
to_mode
)
class
ImageMediaIO
(
MediaIO
[
Image
.
Image
]):
def
__init__
(
self
,
image_mode
:
str
=
"RGB"
,
**
kwargs
)
->
None
:
super
().
__init__
()
self
.
image_mode
=
image_mode
# `kwargs` contains custom arguments from
# --media-io-kwargs for this modality.
# They can be passed to the underlying
# media loaders (e.g. custom implementations)
# for flexible control.
self
.
kwargs
=
kwargs
# Extract RGBA background color from kwargs if provided
# Default to white background for backward compatibility
rgba_bg
=
kwargs
.
get
(
"rgba_background_color"
,
(
255
,
255
,
255
))
# Convert list to tuple for consistency
if
isinstance
(
rgba_bg
,
list
):
rgba_bg
=
tuple
(
rgba_bg
)
# Validate rgba_background_color format
if
not
(
isinstance
(
rgba_bg
,
tuple
)
and
len
(
rgba_bg
)
==
3
and
all
(
isinstance
(
c
,
int
)
and
0
<=
c
<=
255
for
c
in
rgba_bg
)
):
raise
ValueError
(
"rgba_background_color must be a list or tuple of 3 integers "
"in the range [0, 255]."
)
self
.
rgba_background_color
=
rgba_bg
def
_convert_image_mode
(
self
,
image
:
Image
.
Image
|
MediaWithBytes
[
Image
.
Image
]
)
->
Image
.
Image
:
"""Convert image mode with custom background color."""
if
isinstance
(
image
,
MediaWithBytes
):
image
=
image
.
media
if
image
.
mode
==
self
.
image_mode
:
return
image
elif
image
.
mode
==
"RGBA"
and
self
.
image_mode
==
"RGB"
:
return
rgba_to_rgb
(
image
,
self
.
rgba_background_color
)
else
:
return
convert_image_mode
(
image
,
self
.
image_mode
)
def
load_bytes
(
self
,
data
:
bytes
)
->
MediaWithBytes
[
Image
.
Image
]:
image
=
Image
.
open
(
BytesIO
(
data
))
return
MediaWithBytes
(
self
.
_convert_image_mode
(
image
),
data
)
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
)
->
MediaWithBytes
[
Image
.
Image
]:
return
self
.
load_bytes
(
pybase64
.
b64decode
(
data
,
validate
=
True
))
def
load_file
(
self
,
filepath
:
Path
)
->
MediaWithBytes
[
Image
.
Image
]:
with
open
(
filepath
,
"rb"
)
as
f
:
data
=
f
.
read
()
image
=
Image
.
open
(
BytesIO
(
data
))
return
MediaWithBytes
(
self
.
_convert_image_mode
(
image
),
data
)
def
encode_base64
(
self
,
media
:
Image
.
Image
,
*
,
image_format
:
str
|
None
=
None
,
)
->
str
:
if
image_format
is
None
:
logger
.
warning_once
(
"The default format of `ImageMediaIO.encode_base64` will be changed "
'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
"To continue using the old default, "
'pass `format="JPEG"` explicitly to silence this warning.'
)
image_format
=
"JPEG"
image
=
media
with
BytesIO
()
as
buffer
:
image
=
self
.
_convert_image_mode
(
image
)
image
.
save
(
buffer
,
image_format
)
data
=
buffer
.
getvalue
()
return
pybase64
.
b64encode
(
data
).
decode
(
"utf-8"
)
class
ImageEmbeddingMediaIO
(
MediaIO
[
torch
.
Tensor
]):
def
__init__
(
self
)
->
None
:
super
().
__init__
()
def
load_bytes
(
self
,
data
:
bytes
)
->
torch
.
Tensor
:
buffer
=
BytesIO
(
data
)
# Enable sparse tensor integrity checks to prevent out-of-bounds
# writes from maliciously crafted tensors
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
load
(
buffer
,
weights_only
=
True
)
return
tensor
.
to_dense
()
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
)
->
torch
.
Tensor
:
return
self
.
load_bytes
(
pybase64
.
b64decode
(
data
,
validate
=
True
))
def
load_file
(
self
,
filepath
:
Path
)
->
torch
.
Tensor
:
# Enable sparse tensor integrity checks to prevent out-of-bounds
# writes from maliciously crafted tensors
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
load
(
filepath
,
weights_only
=
True
)
return
tensor
.
to_dense
()
def
encode_base64
(
self
,
media
:
torch
.
Tensor
)
->
str
:
return
pybase64
.
b64encode
(
media
.
numpy
()).
decode
(
"utf-8"
)
vllm/multimodal/inputs.py
View file @
28459785
...
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
...
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers.feature_extraction_utils
import
BatchFeature
from
transformers.feature_extraction_utils
import
BatchFeature
from
.
base
import
MediaWithBytes
from
.
media
import
MediaWithBytes
else
:
else
:
torch
=
LazyLoader
(
"torch"
,
globals
(),
"torch"
)
torch
=
LazyLoader
(
"torch"
,
globals
(),
"torch"
)
...
...
vllm/multimodal/media/__init__.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
.audio
import
AudioEmbeddingMediaIO
,
AudioMediaIO
from
.base
import
MediaIO
,
MediaWithBytes
from
.image
import
ImageEmbeddingMediaIO
,
ImageMediaIO
from
.video
import
VideoMediaIO
__all__
=
[
"MediaIO"
,
"MediaWithBytes"
,
"AudioEmbeddingMediaIO"
,
"AudioMediaIO"
,
"ImageEmbeddingMediaIO"
,
"ImageMediaIO"
,
"VideoMediaIO"
,
]
vllm/multimodal/media/audio.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
from
io
import
BytesIO
from
pathlib
import
Path
import
numpy.typing
as
npt
import
pybase64
import
torch
from
vllm.utils.import_utils
import
PlaceholderModule
from
vllm.utils.serial_utils
import
tensor2base64
from
.base
import
MediaIO
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
try
:
import
soundfile
except
ImportError
:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
class
AudioMediaIO
(
MediaIO
[
tuple
[
npt
.
NDArray
,
float
]]):
def
__init__
(
self
,
**
kwargs
)
->
None
:
super
().
__init__
()
# `kwargs` contains custom arguments from
# --media-io-kwargs for this modality.
# They can be passed to the underlying
# media loaders (e.g. custom implementations)
# for flexible control.
self
.
kwargs
=
kwargs
def
load_bytes
(
self
,
data
:
bytes
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
librosa
.
load
(
BytesIO
(
data
),
sr
=
None
)
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
self
.
load_bytes
(
base64
.
b64decode
(
data
))
def
load_file
(
self
,
filepath
:
Path
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
librosa
.
load
(
filepath
,
sr
=
None
)
def
encode_base64
(
self
,
media
:
tuple
[
npt
.
NDArray
,
int
],
*
,
audio_format
:
str
=
"WAV"
,
)
->
str
:
audio
,
sr
=
media
with
BytesIO
()
as
buffer
:
soundfile
.
write
(
buffer
,
audio
,
sr
,
format
=
audio_format
)
data
=
buffer
.
getvalue
()
return
base64
.
b64encode
(
data
).
decode
(
"utf-8"
)
class
AudioEmbeddingMediaIO
(
MediaIO
[
torch
.
Tensor
]):
def
__init__
(
self
)
->
None
:
super
().
__init__
()
def
load_bytes
(
self
,
data
:
bytes
)
->
torch
.
Tensor
:
buffer
=
BytesIO
(
data
)
# Enable sparse tensor integrity checks to prevent out-of-bounds
# writes from maliciously crafted tensors
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
load
(
buffer
,
weights_only
=
True
)
return
tensor
.
to_dense
()
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
)
->
torch
.
Tensor
:
return
self
.
load_bytes
(
pybase64
.
b64decode
(
data
,
validate
=
True
))
def
load_file
(
self
,
filepath
:
Path
)
->
torch
.
Tensor
:
# Enable sparse tensor integrity checks to prevent out-of-bounds
# writes from maliciously crafted tensors
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
load
(
filepath
,
weights_only
=
True
)
return
tensor
.
to_dense
()
def
encode_base64
(
self
,
media
:
torch
.
Tensor
)
->
str
:
return
tensor2base64
(
media
)
vllm/multimodal/base.py
→
vllm/multimodal/
media/
base.py
View file @
28459785
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Generic
,
TypeVar
from
typing
import
Any
,
Generic
,
TypeVar
import
numpy
as
np
import
numpy
as
np
...
@@ -32,13 +32,14 @@ class MediaWithBytes(Generic[_T]):
...
@@ -32,13 +32,14 @@ class MediaWithBytes(Generic[_T]):
"""Allow np.array(obj) to return np.array(obj.media)."""
"""Allow np.array(obj) to return np.array(obj.media)."""
return
np
.
array
(
self
.
media
,
*
args
,
**
kwargs
)
return
np
.
array
(
self
.
media
,
*
args
,
**
kwargs
)
def
__getstate__
(
self
):
return
self
.
__dict__
.
copy
()
def
__setstate__
(
self
,
state
:
dict
[
str
,
Any
]):
self
.
__dict__
.
update
(
state
)
def
__getattr__
(
self
,
name
:
str
):
def
__getattr__
(
self
,
name
:
str
):
"""Delegate attribute access to the underlying media object."""
"""Delegate attribute access to the underlying media object."""
# Guard against recursion during unpickling when media isn't set yet.
# pickle creates objects without calling __init__, so self.media may
# not exist when __getattr__ is called for methods like __setstate__.
if
"media"
not
in
self
.
__dict__
:
raise
AttributeError
(
name
)
return
getattr
(
self
.
media
,
name
)
return
getattr
(
self
.
media
,
name
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment