[Model] Update Kimi-K25 and Isaac processors to fit HF-style (#37693)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Model] Update Kimi-K25 and Isaac processors to fit HF-style (#37693)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
37aadf62 · Cyrus Leung · GitHub · d7d2b5e4 · 37aadf62 · 37aadf62
Unverified Commit 37aadf62 authored Mar 21, 2026 by Cyrus Leung Committed by GitHub Mar 20, 2026
5 changed files
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -334,15 +334,14 @@ class IsaacProcessingInfo(BaseProcessingInfo):
        return IsaacConfig()
    def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
-        return IsaacImageProcessor(kwargs)
+        return IsaacImageProcessor(**kwargs)
    def get_hf_processor(self, **kwargs) -> IsaacProcessor:
        hf_config = self.get_hf_config()
-        return self.ctx.init_processor(
+        return IsaacProcessor(
-            IsaacProcessor,
            tokenizer=self.get_tokenizer(),
-            image_processor=self.get_image_processor(),
+            image_processor=self.get_image_processor(**kwargs),
            image_token=hf_config.vision_token,
        )

--- a/vllm/model_executor/models/kimi_k25.py
+++ b/vllm/model_executor/models/kimi_k25.py
@@ -104,19 +104,25 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
    def __init__(self, ctx: InputProcessingContext) -> None:
        super().__init__(ctx)
-        self.hf_config = self.get_hf_config()
-        self.media_token_id = self.hf_config.media_placeholder_token_id
+        self.hf_config = hf_config = self.get_hf_config()
-        media_processor = cached_get_image_processor(
+        tokenizer = self.get_tokenizer()
+        image_processor = cached_get_image_processor(
            self.ctx.model_config.model,
            trust_remote_code=self.ctx.model_config.trust_remote_code,
        )
-        self.media_processor = media_processor
+        self.media_token_id = media_token_id = hf_config.media_placeholder_token_id
+        self.media_token = tokenizer.decode(media_token_id)
+        self.image_processor = image_processor
        self.hf_processor = KimiK25Processor(
-            media_processor=self.media_processor,
+            tokenizer=tokenizer,
-            tokenizer=self.get_tokenizer(),
+            image_processor=image_processor,
-            media_token_id=self.media_token_id,
+            media_token_id=media_token_id,
        )
-        self.media_tokens_calculator = self.media_processor.media_tokens_calculator
+        self.media_tokens_calculator = image_processor.media_tokens_calculator
    def get_hf_processor(self):
        return self.hf_processor
@@ -132,20 +138,15 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
 class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
    """Builds dummy inputs for Kimi-K2.5 model profiling."""
-    def __init__(self, info: KimiK25ProcessingInfo) -> None:
-        super().__init__(info)
-        self.media_token_id = self.info.media_token_id
-        self.frame_per_chunk = self.info.media_processor.num_frames_per_chunk
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_media = mm_counts.get("vision_chunk", 0)
-        return "<|media_pad|>" * num_media
+        return self.info.media_token * num_media
    def get_dummy_mm_items(self):
        dummy_videos = self._get_dummy_images(
            height=MaxImageTokenMeta.height,
            width=MaxImageTokenMeta.width,
-            num_images=self.frame_per_chunk,
+            num_images=self.info.image_processor.num_frames_per_chunk,
        )
        video_chunk_dummy_item = VisionChunkVideo(
@@ -236,9 +237,6 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
            ),
        ]
-    def split_video_chunks(self, video):
-        return self.info.media_processor.split_video_chunks(video)
 @MULTIMODAL_REGISTRY.register_processor(
    KimiK25MultiModalProcessor,

--- a/vllm/transformers_utils/processors/isaac.py
+++ b/vllm/transformers_utils/processors/isaac.py
@@ -6,12 +6,14 @@ import math
 from typing import Any
 import numpy as np
-import PIL.Image
 import torch
 import torch.nn.functional as F
+from PIL import Image
 from transformers import BatchFeature, ProcessorMixin, TensorType
 from typing_extensions import TypedDict, Unpack
+from vllm.tokenizers.hf import HfTokenizer
 MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px
 # Vision preprocessing constants
@@ -39,7 +41,7 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray:
        return arr.copy()
-def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
+def extract_image_pil(image: Image.Image) -> torch.Tensor:
    if image.width * image.height > MAX_PIXELS:
        raise ValueError(
            f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
@@ -314,31 +316,30 @@ class IsaacImageProcessorKwargs(TypedDict, total=False):
 class IsaacImageProcessor:
-    patch_size = 16
-    max_num_patches = 6144
-    min_num_patches = 256
-    pixel_shuffle_scale = 2
    valid_kwargs = IsaacImageProcessorKwargs
    model_input_names = ["pixel_values", "image_grid_thw"]
-    def __init__(self, kwargs):
+    def __init__(
-        self.patch_size = kwargs.pop("patch_size", self.patch_size)
-        self.vision_max_num_patches = kwargs.pop(
-            "vision_max_num_patches", self.max_num_patches
-        )
-        self.vision_min_num_patches = kwargs.pop(
-            "vision_min_num_patches", self.min_num_patches
-        )
-        self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2)
-    def preprocess(
        self,
-        images: list[torch.Tensor],
+        patch_size: int = 16,
-        return_tensors: str | TensorType | None,
+        vision_max_num_patches: int = 6144,
+        vision_min_num_patches: int = 256,
+        pixel_shuffle_scale: int = 2,
+    ) -> None:
+        self.patch_size = patch_size
+        self.vision_max_num_patches = vision_max_num_patches
+        self.vision_min_num_patches = vision_min_num_patches
+        self.pixel_shuffle_scale = pixel_shuffle_scale
+    def __call__(
+        self,
+        images: Image.Image | list[Image.Image],
+        return_tensors: str | TensorType | None = None,
        **kwargs: Unpack[IsaacImageProcessorKwargs],
    ) -> BatchFeature:
        """Preprocess images into format compatible with vLLM input processing."""
+        if not isinstance(images, list):
+            images = [images]
        all_pixel_values: list[torch.Tensor] = []
        all_image_grids: list[torch.Tensor] = []
@@ -388,23 +389,40 @@ class IsaacImageProcessor:
 class IsaacProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
-    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+    def __init__(
-        self.image_token = kwargs.pop("image_token", "<image>")
+        self,
+        image_processor: IsaacImageProcessor,
+        tokenizer: HfTokenizer,
+        image_token: str = "<image>",
+    ):
        self.image_processor = image_processor
        self.tokenizer = tokenizer
-    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
+        self.image_token = image_token
-        result = {}
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
        if images is not None:
-            image_inputs = self.image_processor.preprocess(images, **kwargs)
+            image_inputs = self.image_processor(
+                images,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
            image_grid_thw = image_inputs["image_grid_thw"]
-            result.update(image_inputs)
+        else:
+            image_inputs = {}
+            image_grid_thw = []
-            if text is not None:
+        if text is not None:
-                if not isinstance(text, list):
+            if not isinstance(text, list):
-                    text = [text]
+                text = [text]
+            if image_inputs:
                text = text.copy()  # below lines change text in-place
                merge_length = self.image_processor.pixel_shuffle_scale**2
                index = 0
@@ -417,10 +435,14 @@ class IsaacProcessor(ProcessorMixin):
                        index += 1
                    text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
-        if text is not None:
+            text_inputs = self.tokenizer(text, return_tensors=return_tensors)
-            result.update(self.tokenizer(text, **kwargs))
+        else:
+            text_inputs = {}
-        return BatchFeature(result)
+        return BatchFeature(
+            data={**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )
    def apply_chat_template(
        self,

--- a/vllm/transformers_utils/processors/kimi_k25.py
+++ b/vllm/transformers_utils/processors/kimi_k25.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
+from transformers import BaseImageProcessor, BatchFeature, TensorType
-from transformers import BatchFeature
 from transformers.processing_utils import ProcessorMixin
 from vllm.multimodal.inputs import VisionChunk
+from vllm.tokenizers.hf import HfTokenizer
 class KimiK25Processor(ProcessorMixin):
-    attributes = ["tokenizer"]
+    attributes = ["image_processor", "tokenizer"]
-    tokenizer_class = "AutoTokenizer"
    def __init__(
-        self, media_processor=None, tokenizer=None, media_token_id: int | None = None
+        self,
-    ):
+        image_processor: BaseImageProcessor,
-        super().__init__(tokenizer)
+        tokenizer: HfTokenizer,
-        self.media_processor = media_processor
+        media_token_id: int,
+    ) -> None:
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
        self.media_token_id = media_token_id
-        assert self.media_token_id is not None
    def __call__(
        self,
+        text: str | list[str] | None = None,
        vision_chunks: list[VisionChunk] | None = None,
-        *,
+        return_tensors: str | TensorType | None = None,
-        text: list[int] | str,
        **kwargs,
    ) -> BatchFeature:
        """
        Args:
-            vision_chunks: List of VisionChunk items to be processed.
+            text: The text to be fed to the model.
-                For image: VisionChunkImage with type='image', image=PIL.Image
+            vision_chunks: List of `VisionChunk` items to be processed.
-                For video_chunk: VisionChunkVideo with type='video_chunk',
+                For image: `VisionChunkImage` with
-                  video_chunk=list[PIL.Image]
+                  `type='image', image=PIL.Image`
-            text: The token ids to be fed to a model (required).
+                For video_chunk: `VisionChunkVideo` with
+                  `type='video_chunk', video_chunk=list[PIL.Image]`
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -42,31 +45,44 @@ class KimiK25Processor(ProcessorMixin):
            - **grid_thws** -- list of image 3D grid in LLM.
              Returned when `vision_chunks` is not `None`.
        """
-        mm_inputs = {}
-        input_ids = self.tokenizer.encode(text) if isinstance(text, str) else text
        if vision_chunks is not None:
-            assert isinstance(vision_chunks, list)
+            mm_inputs = self.image_processor.preprocess(
-            mm_inputs = self.media_processor.preprocess(vision_chunks)
+                vision_chunks,
+                return_tensors=return_tensors,
+            )
+        else:
+            mm_inputs = {}
+        if text is not None:
+            if not isinstance(text, list):
+                text = [text]
+            text_inputs = self.tokenizer(text)
+            # Note: Modify in-place
+            input_ids: list[list[int]] = text_inputs["input_ids"]  # type: ignore
+            if vision_chunks is not None:
+                num_tokens_per_chunk = [
+                    self.image_processor.media_tokens_calculator(chunk)
+                    for chunk in vision_chunks
+                ]
-            num_tokens_per_chunk = [
+                for i in range(len(input_ids)):
-                self.media_processor.media_tokens_calculator(chunk)
+                    new_input_ids = []
-                for chunk in vision_chunks
+                    for token in input_ids[i]:
-            ]
+                        if token == self.media_token_id:
+                            new_input_ids.extend(
+                                [self.media_token_id] * num_tokens_per_chunk.pop(0)
+                            )
+                        else:
+                            new_input_ids.append(token)
-            new_input_ids = []
+                    input_ids[i] = new_input_ids
-            for token in input_ids:
+        else:
-                if token == self.media_token_id:
+            text_inputs = {}
-                    new_input_ids.extend(
-                        [self.media_token_id] * num_tokens_per_chunk.pop(0)
-                    )
-                else:
-                    new_input_ids.append(token)
-            input_ids = new_input_ids
-        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
        return BatchFeature(
-            data={
+            data={**text_inputs, **mm_inputs},
-                "input_ids": torch.tensor([input_ids]),
+            tensor_type=return_tensors,
-                **mm_inputs,
-            }
        )
--- a/vllm/transformers_utils/processors/step3_vl.py
+++ b/vllm/transformers_utils/processors/step3_vl.py
@@ -286,11 +286,9 @@ class Step3VLImageProcessor:
    def __call__(
        self,
-        images: Image.Image | list[Image.Image] | None = None,
+        images: Image.Image | list[Image.Image],
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
-        if images is None:
-            images = []
        if not isinstance(images, list):
            images = [images]