Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
28459785
Unverified
Commit
28459785
authored
Jan 15, 2026
by
Cyrus Leung
Committed by
GitHub
Jan 15, 2026
Browse files
[3/N] Group together media-related code (#32406)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
8853a50a
Changes
25
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
222 additions
and
86 deletions
+222
-86
vllm/multimodal/media/image.py
vllm/multimodal/media/image.py
+124
-0
vllm/multimodal/media/video.py
vllm/multimodal/media/video.py
+89
-0
vllm/multimodal/parse.py
vllm/multimodal/parse.py
+1
-1
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+8
-4
vllm/multimodal/video.py
vllm/multimodal/video.py
+0
-81
No files found.
vllm/multimodal/media/image.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
io
import
BytesIO
from
pathlib
import
Path
import
pybase64
import
torch
from
PIL
import
Image
from
vllm.logger
import
init_logger
from
..image
import
convert_image_mode
,
rgba_to_rgb
from
.base
import
MediaIO
,
MediaWithBytes
# Name the logger after the module's dotted import path rather than its
# filesystem path, so that it sits inside the package's logger hierarchy.
logger = init_logger(__name__)
class ImageMediaIO(MediaIO[Image.Image]):
    """Load PIL images from bytes/base64/files and encode them to base64.

    Every image handled by this class is converted to ``image_mode``. When an
    RGBA image has to become RGB, ``rgba_background_color`` is handed to
    ``rgba_to_rgb`` (presumably as the color placed behind transparent
    pixels -- confirm against that helper).
    """

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        """Store the target mode and validate the optional background color.

        Args:
            image_mode: PIL mode that loaded/encoded images are converted to.
            **kwargs: Custom arguments from ``--media-io-kwargs`` for this
                modality. The only key read here is
                ``"rgba_background_color"``; it must be a list or tuple of
                three integers in ``[0, 255]`` and defaults to white.

        Raises:
            ValueError: If ``rgba_background_color`` has the wrong shape,
                element type or range.
        """
        super().__init__()

        self.image_mode = image_mode
        # `kwargs` contains custom arguments from
        # --media-io-kwargs for this modality.
        # They can be passed to the underlying
        # media loaders (e.g. custom implementations)
        # for flexible control.
        self.kwargs = kwargs

        # Extract RGBA background color from kwargs if provided
        # Default to white background for backward compatibility
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        # Convert list to tuple for consistency
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)

        # Validate rgba_background_color format
        if not (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        ):
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert image mode with custom background color.

        A ``MediaWithBytes`` wrapper is unwrapped first. An image that is
        already in ``self.image_mode`` is returned unchanged.
        """
        if isinstance(image, MediaWithBytes):
            image = image.media
        if image.mode == self.image_mode:
            return image
        elif image.mode == "RGBA" and self.image_mode == "RGB":
            # Only this conversion uses the configured background color.
            return rgba_to_rgb(image, self.rgba_background_color)
        else:
            return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Decode ``data`` as an image and pair it with the original bytes."""
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Decode a base64 string (strictly validated) and load it as an image.

        ``media_type`` is accepted for interface compatibility and not used.
        """
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Read ``filepath`` in binary mode and load its contents as an image."""
        with open(filepath, "rb") as f:
            data = f.read()
        # Same decode + mode conversion as any other byte payload.
        return self.load_bytes(data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str | None = None,
    ) -> str:
        """Convert ``media`` to ``self.image_mode`` and return it as base64 text.

        Args:
            media: Image to encode.
            image_format: Format name passed to ``Image.save``. When omitted,
                a one-time warning is logged and ``"JPEG"`` is used.

        Returns:
            The base64 encoding of the saved image, as a UTF-8 string.
        """
        if image_format is None:
            # The hint must name the real keyword-only parameter
            # (`image_format`); `format=` is not accepted by this method.
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `image_format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"

        with BytesIO() as buffer:
            image = self._convert_image_mode(media)
            image.save(buffer, image_format)
            data = buffer.getvalue()

        return pybase64.b64encode(data).decode("utf-8")
class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode image embeddings stored as serialized torch tensors.

    All loaders deserialize with ``torch.load(..., weights_only=True)`` and
    return a dense tensor; ``encode_base64`` produces the matching
    ``torch.save`` serialization so that encode/load round-trips.
    """

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from ``torch.save`` bytes and densify it."""
        buffer = BytesIO(data)
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(buffer, weights_only=True)
            return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Decode a base64 string (strictly validated) and load the tensor.

        ``media_type`` is accepted for interface compatibility and not used.
        """
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from the file at ``filepath`` and densify it."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
            return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize ``media`` with ``torch.save`` and return it as base64 text.

        The payload uses the same format that ``load_bytes``/``load_base64``
        read with ``torch.load``. Encoding the raw ``media.numpy()`` buffer
        instead would drop shape and dtype and could not be loaded back by
        this class.
        """
        with BytesIO() as buffer:
            torch.save(media, buffer)
            data = buffer.getvalue()

        return pybase64.b64encode(data).decode("utf-8")
vllm/multimodal/media/video.py
0 → 100644
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
from
functools
import
partial
from
pathlib
import
Path
from
typing
import
Any
import
numpy
as
np
import
numpy.typing
as
npt
from
PIL
import
Image
from
vllm
import
envs
from
..video
import
VIDEO_LOADER_REGISTRY
from
.base
import
MediaIO
from
.image
import
ImageMediaIO
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Media IO for videos, producing ``(frames, metadata)`` pairs.

    Encoded video bytes are handed to a loader obtained from
    ``VIDEO_LOADER_REGISTRY``. Base64 payloads of type ``video/jpeg`` are
    treated as comma-separated JPEG frames and decoded one by one through
    the supplied ``image_io``.
    """

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        """Select the video loader backend and remember the loader options.

        Args:
            image_io: Image IO used for per-frame JPEG decoding/encoding.
            num_frames: Forwarded as ``num_frames`` to the loader's
                ``load_bytes``.
            **kwargs: Custom arguments from ``--media-io-kwargs`` for this
                modality. ``"video_backend"`` is consumed here; the rest is
                forwarded to the loader.
        """
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames

        # A "video_backend" entry takes precedence over the global
        # VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
        # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        # It is popped so that it is not forwarded to the loader below.
        backend_name = kwargs.pop("video_backend", None)
        if not backend_name:
            backend_name = envs.VLLM_VIDEO_LOADER_BACKEND

        # Whatever remains can be passed to the underlying media loaders
        # (e.g. custom implementations) for flexible control.
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(backend_name)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode encoded video bytes with the selected loader."""
        loader = self.video_loader
        return loader.load_bytes(data, num_frames=self.num_frames, **self.kwargs)

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a base64 video payload.

        For ``video/jpeg`` (case-insensitive), ``data`` is split on commas,
        each piece is loaded as a JPEG image, and the frames are stacked;
        the returned metadata dict is empty. Any other media type is
        base64-decoded and passed to ``load_bytes``.
        """
        if media_type.lower() != "video/jpeg":
            return self.load_bytes(base64.b64decode(data))

        decoded_frames = []
        for encoded_frame in data.split(","):
            frame = self.image_io.load_base64("image/jpeg", encoded_frame)
            decoded_frames.append(np.asarray(frame))

        return np.stack(decoded_frames), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Read the whole file in binary mode and decode it via ``load_bytes``."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Encode each frame as a base64 JPEG and join them with commas.

        Raises:
            NotImplementedError: If ``video_format`` is not ``"JPEG"``.
        """
        if video_format != "JPEG":
            msg = "Only JPEG format is supported for now."
            raise NotImplementedError(msg)

        encoded_frames = []
        for frame in media:
            frame_image = Image.fromarray(frame)
            encoded_frames.append(
                self.image_io.encode_base64(frame_image, image_format=video_format)
            )

        return ",".join(encoded_frames)
vllm/multimodal/parse.py
View file @
28459785
...
...
@@ -23,7 +23,6 @@ from vllm.utils.collection_utils import is_list_of
from
vllm.utils.import_utils
import
LazyLoader
from
.audio
import
AudioResampler
,
AudioSpec
,
normalize_audio
from
.base
import
MediaWithBytes
from
.inputs
import
(
AudioItem
,
HfAudioItem
,
...
...
@@ -36,6 +35,7 @@ from .inputs import (
MultiModalKwargsItems
,
VideoItem
,
)
from
.media
import
MediaWithBytes
_T
=
TypeVar
(
"_T"
)
_I
=
TypeVar
(
"_I"
)
...
...
vllm/multimodal/utils.py
View file @
28459785
...
...
@@ -22,10 +22,14 @@ from vllm.connections import HTTPConnection, global_http_connection
from
vllm.logger
import
init_logger
from
vllm.utils.registry
import
ExtensionManager
from
.audio
import
AudioEmbeddingMediaIO
,
AudioMediaIO
from
.base
import
MediaIO
from
.image
import
ImageEmbeddingMediaIO
,
ImageMediaIO
from
.video
import
VideoMediaIO
from
.media
import
(
AudioEmbeddingMediaIO
,
AudioMediaIO
,
ImageEmbeddingMediaIO
,
ImageMediaIO
,
MediaIO
,
VideoMediaIO
,
)
if
TYPE_CHECKING
:
from
.inputs
import
(
...
...
vllm/multimodal/video.py
View file @
28459785
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
math
from
abc
import
abstractmethod
from
functools
import
partial
from
io
import
BytesIO
from
pathlib
import
Path
from
typing
import
TYPE_CHECKING
,
Any
,
cast
import
numpy
as
np
import
numpy.typing
as
npt
from
PIL
import
Image
if
TYPE_CHECKING
:
import
cv2
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.utils.registry
import
ExtensionManager
from
.base
import
MediaIO
from
.image
import
ImageMediaIO
logger
=
init_logger
(
__name__
)
...
...
@@ -755,76 +747,3 @@ class Molmo2VideoBackend(VideoLoader):
**
kwargs
,
)
return
out
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Video media IO returning a ``(frames, metadata)`` tuple.

    Raw video bytes go to a loader from ``VIDEO_LOADER_REGISTRY``; the
    ``video/jpeg`` base64 form (comma-separated JPEG frames) is handled
    frame by frame with ``image_io``.
    """

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        """Pick the loader backend and keep the remaining loader options.

        Args:
            image_io: Image IO used to decode/encode individual JPEG frames.
            num_frames: Passed to the loader as ``num_frames``.
            **kwargs: Custom arguments from ``--media-io-kwargs`` for this
                modality; ``"video_backend"`` is removed here and the rest
                is forwarded to the loader.
        """
        super().__init__()

        # Allow the backend to be overridden via kwargs instead of the
        # global VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
        # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        requested_backend = kwargs.pop("video_backend", None)
        backend = requested_backend or envs.VLLM_VIDEO_LOADER_BACKEND

        self.image_io = image_io
        self.num_frames = num_frames
        # The remaining custom arguments can be passed to the underlying
        # media loaders (e.g. custom implementations) for flexible control.
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(backend)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode encoded video bytes using the configured loader."""
        result = self.video_loader.load_bytes(
            data,
            num_frames=self.num_frames,
            **self.kwargs,
        )
        return result

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a base64 video payload.

        ``video/jpeg`` (compared case-insensitively) means ``data`` holds
        comma-separated base64 JPEG frames, which are stacked and returned
        with an empty metadata dict. Other types are base64-decoded and
        handed to ``load_bytes``.
        """
        is_jpeg_frames = media_type.lower() == "video/jpeg"
        if not is_jpeg_frames:
            raw = base64.b64decode(data)
            return self.load_bytes(raw)

        decode_frame = partial(self.image_io.load_base64, "image/jpeg")
        frame_arrays = [np.asarray(decode_frame(piece)) for piece in data.split(",")]
        return np.stack(frame_arrays), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load a video from ``filepath`` by reading its bytes."""
        with filepath.open("rb") as stream:
            payload = stream.read()
        return self.load_bytes(payload)

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Return the frames of ``media`` as comma-joined base64 JPEG strings.

        Raises:
            NotImplementedError: If ``video_format`` is anything but
                ``"JPEG"``.
        """
        if video_format != "JPEG":
            msg = "Only JPEG format is supported for now."
            raise NotImplementedError(msg)

        to_base64 = partial(self.image_io.encode_base64, image_format=video_format)
        return ",".join([to_base64(Image.fromarray(frame)) for frame in media])
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment