[Bugfix] Follow-up fix on MediaWithBytes (#29951)

Signed-off-by: Roger Wang <hey@rogerw.io>

[Bugfix] Follow-up fix on MediaWithBytes (#29951)
Signed-off-by: Roger Wang <hey@rogerw.io>
787b84a9 · Roger Wang · GitHub · 42c19496 · 787b84a9 · 787b84a9
Unverified Commit 787b84a9 authored Dec 03, 2025 by Roger Wang Committed by GitHub Dec 03, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

vllm/multimodal/base.py +2 -0

vllm/multimodal/inputs.py +2 -1

vllm/multimodal/parse.py +1 -1

No files found.
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]):
    The wrapper delegates attribute access to the underlying media object,
    making it behave transparently like the wrapped type (e.g., PIL.Image).
+
+    NOTE: Currently, this wrapper is used only for the image modality.
    """
    media: _T

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
    from PIL.Image import Image
    from transformers.feature_extraction_utils import BatchFeature
+    from .base import MediaWithBytes
    from .processing import MultiModalHashes
 else:
@@ -59,7 +60,7 @@ Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
 """
-ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
 """
 A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.

--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -484,7 +484,7 @@ class MultiModalDataParser:
            return ImageEmbeddingItems(data)
        if (
-            isinstance(data, PILImage.Image)
+            isinstance(data, (PILImage.Image, MediaWithBytes))
            or isinstance(data, (np.ndarray, torch.Tensor))
            and data.ndim == 3
        ):