Migrate KimiVLImagePixelInputs to TensorSchema (#21769)

Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Isotr0py <2037008807@qq.com>

Migrate KimiVLImagePixelInputs to TensorSchema (#21769)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
05fae021 · Benji Beck · GitHub · d1bf1b97 · 05fae021
Unverified commit 05fae021, authored Aug 05, 2025 by Benji Beck; committed by GitHub on Aug 05, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 9 deletions

vllm/model_executor/models/kimi_vl.py vllm/model_executor/models/kimi_vl.py +15 -9

No files found.
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -46,7 +46,7 @@ import copy
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Literal, Optional, Union
 import torch
 from torch import nn
@@ -79,6 +79,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .utils import is_pp_missing_parameter, maybe_prefix
@@ -118,15 +119,22 @@ class KimiVLMultiModalProjector(nn.Module):
        return hidden_states
-class KimiVLImagePixelInputs(TypedDict):
+class KimiVLImagePixelInputs(TensorSchema):
-    type: Literal["pixel_values"]
-    pixel_values: Union[torch.Tensor, list[torch.Tensor]]
    """
-    Shape:`(num_patches, num_channels, patch_size, patch_size)`
+    Dimensions:
+        - nc: Number of channels
+        - np: Number of patches
+        - ps: Patch size
+        - ni: Number of images
    """
+    type: Literal["pixel_values"] = "pixel_values"
-    image_grid_hws: torch.Tensor
+    pixel_values: Annotated[
-    """Shape:`(num_images, 2)`"""
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("np", 3, "ps", "ps"),
+    ]
+    image_grid_hws: Annotated[torch.Tensor, TensorShape("ni", 2)]
 # TODO: support embeds too
@@ -348,8 +356,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal):
            pixel_values = pixel_values.reshape(-1, num_channels, patch_size,
                                                patch_size)
        pixel_values = pixel_values.to(self.vision_tower.dtype)
-        # image_grid_hws.shape = (N, 2)
-        assert image_grid_hws.ndim == 2, f"unexpected shape for image_grid_hws: {image_grid_hws.shape}"
        return KimiVLImagePixelInputs(
            type="pixel_values",