Migrate MllamaImagePixelInputs to TensorSchema (#22020)

Signed-off-by: Benji Beck <benjibeck@meta.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

Migrate MllamaImagePixelInputs to TensorSchema (#22020)
Signed-off-by: Benji Beck <benjibeck@meta.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
0b9cc56f · Benji Beck · GitHub · 8896eb72 · 0b9cc56f
Unverified Commit 0b9cc56f authored Aug 21, 2025 by Benji Beck Committed by GitHub Aug 22, 2025
Show whitespace changes
Inline Side-by-side

Showing with 26 additions and 10 deletions

vllm/model_executor/models/mllama.py vllm/model_executor/models/mllama.py +26 -10

No files found.
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -17,7 +17,7 @@
 """PyTorch Mllama model."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union

 import numpy as np
 import torch
@@ -64,6 +64,7 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
                                        EncDecMultiModalProcessor,
                                        PromptReplacement, PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .clip import CLIPMLP
 from .interfaces import SupportsMultiModal, SupportsV0Only
@@ -73,15 +74,30 @@ from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 logger = init_logger(__name__)


-class MllamaImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: """
-    """(batch_size, max_num_image, max_num_chunk, num_channel, height, width)"""
-    aspect_ratio_ids: torch.Tensor
-    """Shape: `(batch_size, max_num_image)`"""
-    aspect_ratio_mask: torch.Tensor
-    """Shape: `(batch_size, max_num_image, max_num_tiles)`"""
+class MllamaImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - batch_size: Batch size
+        - max_num_image: Max number of images
+        - max_num_chunk: Max number of chunks
+        - max_num_tiles: Max number of tiles per image
+        - num_channel: Number of channels
+        - height: Height
+        - width: Width
+    """
+
+    type: Literal["pixel_values"] = "pixel_values"
+
+    data: Annotated[torch.Tensor,
+                    TensorShape("batch_size", "max_num_image", "max_num_chunk",
+                                "num_channel", "height", "width")]
+
+    aspect_ratio_ids: Annotated[torch.Tensor,
+                                TensorShape("batch_size", "max_num_image")]
+
+    aspect_ratio_mask: Annotated[
+        torch.Tensor,
+        TensorShape("batch_size", "max_num_image", "max_num_tiles")]


 # TODO: support LlamaImageEmbeddingInputs