[Misc] Cleanup more configs and processors (#37560)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Misc] Cleanup more configs and processors (#37560)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
657855ab · Cyrus Leung · GitHub · e27b8ba3 · 657855ab · 657855ab
Unverified Commit 657855ab authored Mar 19, 2026 by Cyrus Leung Committed by GitHub Mar 19, 2026
7 changed files
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -55,7 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "OvisConfig": "vllm.transformers_utils.configs.ovis",
    "PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
    "RadioConfig": "vllm.transformers_utils.configs.radio",
-    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
+    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators",
    "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
    "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
    "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",

--- a/vllm/transformers_utils/configs/speculators/__init__.py
+++ b/vllm/transformers_utils/configs/speculators/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .base import SpeculatorsConfig
+
+__all__ = ["SpeculatorsConfig"]
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -8,9 +8,6 @@ from transformers import PretrainedConfig
 from vllm.transformers_utils.configs.speculators.algos import (
    SUPPORTED_SPECULATORS_TYPES,
 )
-
-__all__ = ["SpeculatorsConfig"]
-
 from vllm.transformers_utils.utils import without_trust_remote_code



--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -21,7 +21,9 @@ __all__ = [
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
    "InternVLProcessor",
+    "IsaacProcessor",
    "KimiAudioProcessor",
+    "KimiK25Processor",
    "MistralCommonPixtralProcessor",
    "MistralCommonVoxtralProcessor",
    "NanoNemotronVLProcessor",
@@ -32,6 +34,7 @@ __all__ = [
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
+    "Step3VLProcessor",
 ]

 _CLASS_TO_MODULE: dict[str, str] = {
@@ -45,7 +48,9 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
    "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
    "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
+    "IsaacProcessor": "vllm.transformers_utils.processors.isaac",
    "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
+    "KimiK25Processor": "vllm.transformers_utils.processors.kimi_k25",
    "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
    "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
    "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
@@ -56,6 +61,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
+    "Step3VLProcessor": "vllm.transformers_utils.processors.step3_vl",
 }



--- a/vllm/transformers_utils/processors/isaac.py
+++ b/vllm/transformers_utils/processors/isaac.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import math
+from typing import Any
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import BatchFeature, ProcessorMixin, TensorType
+from typing_extensions import TypedDict, Unpack
+
+MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px
+
+# Vision preprocessing constants
+VISION_MEAN = (0.5, 0.5, 0.5)
+VISION_STD = (0.5, 0.5, 0.5)
+VISION_SCALE = 1 / 255
+
+
+def _make_writeable(arr: np.ndarray) -> np.ndarray:
+    """Return an array whose buffer can be handed to `torch.from_numpy()`.
+
+    A writeable *arr* is returned unchanged. Otherwise the write flag is
+    toggled in place; if NumPy refuses, a copy of the array is returned
+    instead. Either way the result is writeable, which avoids the PyTorch
+    warning about wrapping a read-only buffer.
+    """
+    if not arr.flags.writeable:
+        try:
+            # Cheap path: flip the flag in place, no data is copied.
+            arr.setflags(write=True)
+        except ValueError:
+            # The buffer cannot be made writeable, so fall back to a copy.
+            return arr.copy()
+    return arr
+
+
+def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor:
+    """Convert a PIL image into a tensor of its RGB pixel data.
+
+    Args:
+        image: Source image. Any mode other than "RGB" is converted to RGB
+            first.
+
+    Returns:
+        The tensor created by `torch.from_numpy()` from the image's pixel
+        array. A tensor is always returned; this function never yields
+        `None`.
+
+    Raises:
+        ValueError: If the image contains more than `MAX_PIXELS` pixels.
+    """
+    if image.width * image.height > MAX_PIXELS:
+        raise ValueError(
+            f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
+        )
+    rgb = image if image.mode == "RGB" else image.convert("RGB")
+    # torch.from_numpy() needs a writeable buffer to avoid a warning.
+    pixels = _make_writeable(np.asarray(rgb))
+    return torch.from_numpy(pixels)
+
+
+def get_image_size_for_max_num_patches(
+    image_height: int,
+    image_width: int,
+    patch_size: int,
+    max_num_patches: int,
+    min_num_patches: int | None = None,
+    eps: float = 1e-5,
+    pixel_shuffle_scale: int = 1,
+) -> tuple[int, int]:
+    r"""Compute a target resolution whose patch grid fits the patch budget.
+
+    Args:
+        image_height (`int`):
+            Height in pixels of the source image prior to any resizing.
+        image_width (`int`):
+            Width in pixels of the source image prior to any resizing.
+        patch_size (`int`):
+            Size of the square patch used by the vision encoder.
+        max_num_patches (`int`):
+            Upper bound on `(height / patch_size) * (width / patch_size)` after
+            resizing.
+        min_num_patches (`int`, *optional*):
+            Lower bound on the number of patches. When provided the image will
+            be scaled up if necessary. This check takes precedence over the
+            upper bound, and the upscaling factor is searched in `[1, 100]`.
+        eps (`float`, *optional*, defaults to 1e-5):
+            Convergence tolerance for the internal binary search over the
+            scale factor.
+        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
+            Additional stride multiplier applied when pixel shuffle later
+            reduces spatial resolution.
+
+    Returns:
+        `tuple[int, int]`: Height and width (in pixels), each a positive
+        multiple of `patch_size * pixel_shuffle_scale`.
+    """
+    # Both output dimensions are rounded up to a multiple of this value.
+    divisor = patch_size * pixel_shuffle_scale
+
+    def snap(length):
+        # Round up to the next multiple of `divisor`, never below one multiple.
+        return max(divisor, math.ceil(length / divisor) * divisor)
+
+    def size_at(scale):
+        # Snapped (height, width) of the source image scaled by `scale`.
+        return int(snap(scale * image_height)), int(snap(scale * image_width))
+
+    def patch_count(height, width):
+        return (height / patch_size) * (width / patch_size)
+
+    adjusted_height = snap(image_height)
+    adjusted_width = snap(image_width)
+    num_patches = patch_count(adjusted_height, adjusted_width)
+
+    if min_num_patches is not None and num_patches < min_num_patches:
+        # Scale up: bisect for the smallest scale that reaches the minimum.
+        scale_min, scale_max = 1.0, 100.0
+        while (scale_max - scale_min) >= eps:
+            scale = (scale_min + scale_max) / 2
+            if patch_count(*size_at(scale)) >= min_num_patches:
+                scale_max = scale
+            else:
+                scale_min = scale
+        return size_at(scale_max)
+
+    if num_patches <= max_num_patches:
+        return adjusted_height, adjusted_width
+
+    # Scale down: bisect for the largest scale that stays within the maximum.
+    scale_min, scale_max = eps / 10, 1.0
+    while (scale_max - scale_min) >= eps:
+        scale = (scale_min + scale_max) / 2
+        if patch_count(*size_at(scale)) <= max_num_patches:
+            scale_min = scale
+        else:
+            scale_max = scale
+    return size_at(scale_min)
+
+
+_MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1)
+_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1)
+
+
+def prepare_image_tensor(
+    image: torch.Tensor,
+    scale: float = VISION_SCALE,
+) -> torch.Tensor:
+    r"""Rescale RGB values and normalize them with the vision mean and std.
+
+    Args:
+        image (`torch.Tensor`):
+            Tensor with shape `(..., height, width, 3)` containing RGB values.
+            Non-floating-point input is converted with `.float()`; floating
+            point input is used as is.
+        scale (`float`, *optional*, defaults to `VISION_SCALE`):
+            Scalar multiplier applied before normalization.
+    Returns:
+        `torch.Tensor`: `(image * scale - mean) / std`, computed on the
+        device of the input, with the same shape as the input.
+    """
+    pixels = image if torch.is_floating_point(image) else image.float()
+
+    # The statistics are precomputed at module level; move them to the device
+    # of the input before broadcasting over the channel axis.
+    device = image.device
+    mean = _MEAN_TENSOR.to(device)
+    std = _STD_TENSOR.to(device)
+
+    return (pixels * scale - mean) / std
+
+
+def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor:
+    r"""Split images into a grid of flattened square patches.
+
+    Args:
+        image (`torch.Tensor`):
+            Tensor of shape `(num_images, height, width, channels)`.
+        patch_size (`int`):
+            Edge length of the square patches.
+
+    Returns:
+        `torch.Tensor`:
+            Tensor of shape `(num_images, height // patch_size,
+            width // patch_size, channels * patch_size * patch_size)` where
+            each grid position holds the flattened pixels of one patch.
+
+    Raises:
+        ValueError: If `height` or `width` is not divisible by `patch_size`.
+    """
+    num_images, height, width, channels = image.shape
+    grid_h, leftover_h = divmod(height, patch_size)
+    grid_w, leftover_w = divmod(width, patch_size)
+    if leftover_h or leftover_w:
+        raise ValueError(
+            "Dimensions of images "
+            f"{image.shape} are not divisible by patch_size={patch_size}."
+        )
+
+    # Split each spatial axis into (grid index, offset within patch) ...
+    tiles = image.reshape(
+        num_images, grid_h, patch_size, grid_w, patch_size, channels
+    )
+    # ... bring the two grid axes together ...
+    tiles = tiles.permute(0, 1, 3, 2, 4, 5)
+    # ... and flatten everything that belongs to one patch.
+    flat_dim = channels * patch_size * patch_size
+    return tiles.reshape(num_images, grid_h, grid_w, flat_dim)
+
+
+def process_vision_for_patches(
+    images: torch.Tensor,
+    patch_size: int,
+    max_num_patches: int,
+    min_num_patches: int | None = None,
+    pixel_shuffle_scale: int = 1,
+) -> tuple[torch.Tensor, list[int]]:
+    r"""Resize, normalize, and patchify RGB images for the vision encoder.
+
+    Args:
+        images (`torch.Tensor`):
+            Either `(height, width, channels)` for a single image or
+            `(num_images, height, width, channels)` for a batch. Channels are
+            expected to be RGB.
+        patch_size (`int`):
+            Edge length of square patches; also sets the resize granularity.
+        max_num_patches (`int`):
+            Maximum number of patches allowed after resizing.
+        min_num_patches (`int`, *optional*):
+            Minimum number of patches. If provided, images are upsampled as
+            needed to satisfy the lower bound.
+        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
+            Pixel shuffle scale factor; influences the target grid.
+
+    Returns:
+        `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)`
+        where `patches` has shape `(num_images, target_h / patch_size, target_w
+        / patch_size, channels * patch_size**2)` and `dims_virtual` is
+        `[1, h_patches, w_patches]` with both patch counts floor-divided by
+        `pixel_shuffle_scale` when that scale is not 1.
+    """
+    # A single image gets a leading batch axis.
+    batch = images.unsqueeze(0) if images.dim() == 3 else images
+
+    # Resizing is done in channels-first layout.
+    batch = batch.permute(0, 3, 1, 2)
+    _, _, source_height, source_width = batch.shape
+
+    target_size = get_image_size_for_max_num_patches(
+        source_height,
+        source_width,
+        patch_size,
+        max_num_patches,
+        min_num_patches=min_num_patches,
+        pixel_shuffle_scale=pixel_shuffle_scale,
+    )
+    resized = F.interpolate(
+        batch,
+        size=target_size,
+        mode="bilinear",
+        align_corners=False,
+    )
+
+    # Back to channels-last, then normalize and cut into patches.
+    normalized = prepare_image_tensor(resized.permute(0, 2, 3, 1))
+    patches = patchify_vision(normalized, patch_size=patch_size)
+
+    _, h_patches, w_patches, _ = patches.shape
+    if pixel_shuffle_scale == 1:
+        dims_virtual = [1, h_patches, w_patches]
+    else:
+        dims_virtual = [
+            1,
+            h_patches // pixel_shuffle_scale,
+            w_patches // pixel_shuffle_scale,
+        ]
+
+    return patches, dims_virtual
+
+
+class IsaacImageProcessorKwargs(TypedDict, total=False):
+    """Keyword arguments declared for `IsaacImageProcessor.preprocess`."""
+
+    # `total=False` makes every key below optional.
+    patch_size: int
+    max_num_patches: int
+    min_num_patches: int
+    pixel_shuffle_scale: int
+
+
+class IsaacImageProcessor:
+    """Turn PIL images into flattened vision patches and per-image grid sizes."""
+
+    # Class-level defaults used when the constructor kwargs omit a key.
+    patch_size = 16
+    max_num_patches = 6144
+    min_num_patches = 256
+    pixel_shuffle_scale = 2
+
+    valid_kwargs = IsaacImageProcessorKwargs
+    model_input_names = ["pixel_values", "image_grid_thw"]
+
+    def __init__(self, kwargs):
+        """Read the processor settings from *kwargs*.
+
+        Args:
+            kwargs: Mutable mapping of settings. The keys `patch_size`,
+                `vision_max_num_patches`, `vision_min_num_patches` and
+                `pixel_shuffle_scale` are popped from it; missing keys fall
+                back to the class-level defaults.
+        """
+        self.patch_size = kwargs.pop("patch_size", self.patch_size)
+        self.vision_max_num_patches = kwargs.pop(
+            "vision_max_num_patches", self.max_num_patches
+        )
+        self.vision_min_num_patches = kwargs.pop(
+            "vision_min_num_patches", self.min_num_patches
+        )
+        # Fall back to the class attribute (not a literal) so that the default
+        # stays in one place, like the other settings above.
+        self.pixel_shuffle_scale = kwargs.pop(
+            "pixel_shuffle_scale", self.pixel_shuffle_scale
+        )
+
+    def preprocess(
+        self,
+        images: list[PIL.Image.Image],
+        return_tensors: str | TensorType | None = None,
+        **kwargs: Unpack[IsaacImageProcessorKwargs],
+    ) -> BatchFeature:
+        """Preprocess images into format compatible with vLLM input processing.
+
+        Args:
+            images: PIL images; each one is converted with `extract_image_pil`.
+            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.
+            **kwargs: Accepted for interface compatibility; the settings
+                stored on the instance are the ones actually used.
+
+        Returns:
+            A `BatchFeature` with `pixel_values` (patches of all images
+            concatenated along dim 0, one row per patch) and `image_grid_thw`
+            (one `[1, h_patches, w_patches]` row per image).
+        """
+
+        all_pixel_values: list[torch.Tensor] = []
+        all_image_grids: list[torch.Tensor] = []
+
+        for image in images:
+            image_tensor = extract_image_pil(image)
+
+            # The virtual (post pixel-shuffle) dims are not needed here.
+            patches, _ = process_vision_for_patches(
+                image_tensor,
+                patch_size=self.patch_size,
+                max_num_patches=self.vision_max_num_patches,
+                min_num_patches=self.vision_min_num_patches,
+                pixel_shuffle_scale=self.pixel_shuffle_scale,
+            )
+
+            # Isaac packs a dummy temporal dim for images
+            patches = patches.unsqueeze(1)  # [N, T=1, Hp, Wp, D]
+
+            hp, wp, dim = patches.shape[-3], patches.shape[-2], patches.shape[-1]
+            pixel_values = patches.reshape(hp * wp, dim)  # [N_tokens, D]
+
+            # Use real patch dimensions for image_grid_thw, not virtual dimensions
+            # This ensures the vision model receives correct grid info for pixel shuffle
+            image_grid_thw = torch.tensor([1, hp, wp]).unsqueeze(0)
+
+            all_pixel_values.append(pixel_values)
+            all_image_grids.append(image_grid_thw)
+
+        if all_pixel_values:
+            final_pixel_values = torch.cat(all_pixel_values, dim=0)
+            final_image_grids = torch.cat(all_image_grids, dim=0)
+        else:
+            final_pixel_values = torch.empty(0, 0)
+            # Keep the integer dtype that `torch.tensor([1, hp, wp])` produces
+            # in the non-empty case, so downstream code sees one dtype.
+            final_image_grids = torch.empty(0, 3, dtype=torch.long)
+
+        return BatchFeature(
+            data={
+                "pixel_values": final_pixel_values,
+                "image_grid_thw": final_image_grids,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+class IsaacProcessor(ProcessorMixin):
+    """Pair an image processor with a tokenizer and expand image tokens."""
+
+    attributes = ["image_processor", "tokenizer"]
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        """Store both components; `image_token` defaults to `"<image>"`."""
+        self.image_token = kwargs.pop("image_token", "<image>")
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+
+    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
+        """Preprocess *images* and tokenize *text*.
+
+        When both are given, each occurrence of the image token in the text is
+        replaced by as many `<|image_pad|>` tokens as the matching image's
+        grid size divided by `pixel_shuffle_scale ** 2`. `**kwargs` is passed
+        to both the image processor and the tokenizer.
+        """
+        outputs = {}
+
+        if images is not None:
+            image_inputs = self.image_processor.preprocess(images, **kwargs)
+            image_grid_thw = image_inputs["image_grid_thw"]
+            outputs.update(image_inputs)
+
+            if text is not None:
+                # Work on a copy so the caller's list is left untouched.
+                text = text.copy() if isinstance(text, list) else [text]
+                merge_length = self.image_processor.pixel_shuffle_scale**2
+                image_index = 0
+                for i, prompt in enumerate(text):
+                    # A temporary marker is used during expansion so that the
+                    # search for the image token only sees unexpanded ones.
+                    while self.image_token in prompt:
+                        num_image_tokens = (
+                            image_grid_thw[image_index].prod() // merge_length
+                        )
+                        prompt = prompt.replace(
+                            self.image_token, "<|placeholder|>" * num_image_tokens, 1
+                        )
+                        image_index += 1
+                    text[i] = prompt.replace("<|placeholder|>", "<|image_pad|>")
+
+        if text is not None:
+            outputs.update(self.tokenizer(text, **kwargs))
+
+        return BatchFeature(outputs)
+
+    def apply_chat_template(
+        self,
+        messages: list[dict[str, Any]],
+        tokenize: bool = False,
+        add_generation_prompt: bool = False,
+        **kwargs,
+    ) -> Any:
+        """Flatten list-valued message content and apply the tokenizer template.
+
+        Text items contribute their text, image items contribute the image
+        token, and any other item type is dropped. Messages whose content is
+        not a list are passed through unchanged. `return_dict` is forced to
+        `False`.
+        """
+        flattened = []
+
+        for message in messages:
+            has_parts = "content" in message and isinstance(
+                message["content"], list
+            )
+            if not has_parts:
+                # Plain text message: nothing to flatten.
+                flattened.append(message)
+                continue
+
+            pieces = []
+            for part in message["content"]:
+                part_type = part.get("type")
+                if part_type == "text":
+                    pieces.append(part.get("text", ""))
+                elif part_type == "image":
+                    # The image itself is represented by the vision token.
+                    pieces.append(self.image_token)
+
+            flattened.append(
+                {
+                    "role": message.get("role", "user"),
+                    "content": "".join(pieces),
+                }
+            )
+
+        kwargs["return_dict"] = False
+        return self.tokenizer.apply_chat_template(
+            flattened,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
+            **kwargs,
+        )
--- a/vllm/transformers_utils/processors/kimi_k25.py
+++ b/vllm/transformers_utils/processors/kimi_k25.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from transformers import BatchFeature
+from transformers.processing_utils import ProcessorMixin
+
+from vllm.multimodal.inputs import VisionChunk
+
+
+class KimiK25Processor(ProcessorMixin):
+    """Expand media tokens in a prompt and attach preprocessed vision inputs."""
+
+    attributes = ["tokenizer"]
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self, media_processor=None, tokenizer=None, media_token_id: int | None = None
+    ):
+        """Store the media processor and the media token id (must not be None)."""
+        super().__init__(tokenizer)
+        self.media_processor = media_processor
+        self.media_token_id = media_token_id
+        assert self.media_token_id is not None
+
+    def __call__(
+        self,
+        vision_chunks: list[VisionChunk] | None = None,
+        *,
+        text: list[int] | str,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Args:
+            vision_chunks: List of VisionChunk items to be processed.
+                For image: VisionChunkImage with type='image', image=PIL.Image
+                For video_chunk: VisionChunkVideo with type='video_chunk',
+                  video_chunk=list[PIL.Image]
+            text: The token ids to be fed to a model (required). A `str` is
+                first encoded with `tokenizer.encode`.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- tensor holding a single row of token ids, with
+              every media token repeated once per media token of its chunk.
+            - Whatever `media_processor.preprocess` returns, merged in when
+              `vision_chunks` is not `None`.
+        """
+        token_ids = text if not isinstance(text, str) else self.tokenizer.encode(text)
+        media_inputs = {}
+
+        if vision_chunks is not None:
+            assert isinstance(vision_chunks, list)
+            media_inputs = self.media_processor.preprocess(vision_chunks)
+
+            pending_counts = [
+                self.media_processor.media_tokens_calculator(chunk)
+                for chunk in vision_chunks
+            ]
+            # Reversed so that `pop()` hands the counts out in chunk order.
+            pending_counts.reverse()
+
+            expanded_ids = []
+            for token in token_ids:
+                if token != self.media_token_id:
+                    expanded_ids.append(token)
+                    continue
+                # Each media token stands for the next vision chunk.
+                expanded_ids.extend([self.media_token_id] * pending_counts.pop())
+            token_ids = expanded_ids
+
+        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
+        return BatchFeature(
+            data={
+                "input_ids": torch.tensor([token_ids]),
+                **media_inputs,
+            }
+        )
--- a/vllm/transformers_utils/processors/step3_vl.py
+++ b/vllm/transformers_utils/processors/step3_vl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from itertools import product
+from math import ceil
+
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+from transformers import BatchFeature, PretrainedConfig, TensorType
+
+from vllm.tokenizers import TokenizerLike
+
+MAX_IMAGE_SIZE: int = 3024
+
+ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
+
+
+class Step3VisionProcessor:
+    """Convert images and image patches into normalized, resized tensors."""
+
+    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
+        """Build the transform pipelines.
+
+        Args:
+            size: Edge length that full images are resized to.
+            interpolation_mode: `"bicubic"` selects bicubic resizing; any other
+                value selects bilinear.
+            patch_size: Edge length that patches are resized to. Defaults to
+                `size` when `None`.
+        """
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        interpolation = (
+            InterpolationMode.BICUBIC
+            if interpolation_mode == "bicubic"
+            else InterpolationMode.BILINEAR
+        )
+
+        def build_transform(edge):
+            # To tensor, normalize, then resize to an `edge` x `edge` square.
+            return transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Normalize(mean, std),
+                    transforms.Resize(
+                        (edge, edge),
+                        interpolation=interpolation,
+                        antialias=True,
+                    ),
+                ]
+            )
+
+        self.transform = build_transform(size)
+        # `patch_size` falls back to `size`, so a patch pipeline always exists.
+        self.patch_transform = build_transform(
+            patch_size if patch_size is not None else size
+        )
+
+    def __call__(self, image, is_patch=False):
+        """Return `{"pixel_values": tensor}` with a leading batch axis of 1.
+
+        Args:
+            image: Input accepted by the transform pipeline.
+            is_patch: Use the patch pipeline instead of the full-image one.
+        """
+        if is_patch:
+            assert self.patch_transform is not None
+            pipeline = self.patch_transform
+        else:
+            pipeline = self.transform
+
+        return {"pixel_values": pipeline(image).unsqueeze(0)}
+
+
+class ImagePatcher:
+    """Resize an image and optionally cut it into square window patches."""
+
+    def __init__(self, enable_patch: bool = True) -> None:
+        # When False, no patches are produced regardless of the image size.
+        self.enable_patch = enable_patch
+
+    def determine_window_size(self, long: int, short: int) -> int:
+        """Pick the window edge from the longer and shorter image sides.
+
+        Returns 0 (no patching) when `long < 728` and `long / short <= 1.5`.
+        """
+        if long < 728:
+            return short if long / short > 1.5 else 0
+        return min(short, 504) if long / short > 4 else 504
+
+    def slide_window(
+        self,
+        width: int,
+        height: int,
+        sizes: list[tuple[int, int]],
+        steps: list[tuple[int, int]],
+        img_rate_thr: float = 0.6,
+    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
+        """Enumerate sliding windows over a `width` x `height` area.
+
+        Returns the windows as `(x, y, w, h)` tuples together with the
+        `(x_num, y_num)` window counts of the last size/step pair.
+        """
+        # NOTE(review): `img_rate_thr` is only range-checked here and not used
+        # afterwards; the message below names it `in_rate_thr`.
+        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
+        windows = []
+        # Sliding windows.
+        for size, step in zip(sizes, steps):
+            size_w, size_h = size
+            step_w, step_h = step
+
+            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
+            x_start = [step_w * i for i in range(x_num)]
+            # Pull the last window back so it ends at the right edge.
+            if len(x_start) > 1 and x_start[-1] + size_w > width:
+                x_start[-1] = width - size_w
+
+            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
+            y_start = [step_h * i for i in range(y_num)]
+            # Pull the last window back so it ends at the bottom edge.
+            if len(y_start) > 1 and y_start[-1] + size_h > height:
+                y_start[-1] = height - size_h
+
+            # product() yields (y, x) pairs; swap the columns to get (x, y).
+            start = np.array(list(product(y_start, x_start)), dtype=int)
+            start[:, [0, 1]] = start[:, [1, 0]]
+            # Each row becomes (x0, y0, x0 + size_w, y0 + size_h).
+            windows.append(np.concatenate([start, start + size], axis=1))
+        windows = np.concatenate(windows, axis=0)
+
+        # Convert corner pairs into (x, y, width, height).
+        return [
+            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+            for box in windows
+        ], (x_num, y_num)
+
+    def square_pad(self, img: Image.Image) -> Image.Image:
+        """Pad *img* to a square of its longer side, filled with 0.
+
+        The original image is pasted at the top-left corner.
+        """
+        w, h = img.size
+        if w == h:
+            return img
+        size = max(w, h)
+        padded = Image.new(img.mode, (size, size), 0)
+        padded.paste(img, (0, 0))
+        return padded
+
+    def get_image_size_for_padding(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+        """Return the size after optional square padding.
+
+        The size becomes square (longer side) only when the shorter side is
+        below 32 and the aspect ratio is above 4 or below 1/4.
+        """
+        ratio = img_width / img_height
+        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
+            new_size = max(img_height, img_width)
+            return new_size, new_size
+        return img_width, img_height
+
+    def get_image_size_for_preprocess(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+        """Scale the size down so the longer side fits `MAX_IMAGE_SIZE`."""
+        if max(img_height, img_width) > MAX_IMAGE_SIZE:
+            scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
+            # int() truncates, so the result never exceeds the limit.
+            img_width = int(img_width * scale_factor)
+            img_height = int(img_height * scale_factor)
+        return img_width, img_height
+
+    def get_image_size_for_crop(
+        self, img_width: int, img_height: int, window_size: int
+    ):
+        """Snap each side to a multiple of `window_size` for cropping.
+
+        A side shorter than the window is kept as is. Otherwise it is rounded
+        up when the fractional part of `side / window_size` exceeds 0.2 and
+        rounded down when it does not.
+        """
+        w_ratio = img_width / window_size
+        h_ratio = img_height / window_size
+
+        if w_ratio < 1:
+            width_new = img_width
+        else:
+            decimal_w = w_ratio - img_width // window_size
+            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
+            width_new = window_size * w_ratio
+        if h_ratio < 1:
+            height_new = img_height
+        else:
+            decimal_h = h_ratio - img_height // window_size
+            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
+            height_new = window_size * h_ratio
+        return int(width_new), int(height_new)
+
+    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
+        """Crop the `tw` x `th` region whose top-left corner is row `i`, column `j`."""
+        target = img.crop((j, i, j + tw, i + th))
+        return target
+
+    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+        """Compute patch counts for an image size without touching pixel data.
+
+        Returns `(num_patches, full_rows)`, or `(0, 0)` when patching is
+        disabled or the window size is 0.
+        """
+        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
+        img_width, img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        window_size = self.determine_window_size(
+            max(img_height, img_width), min(img_height, img_width)
+        )
+        if window_size == 0 or not self.enable_patch:
+            return 0, 0
+        else:
+            img_width, img_height = self.get_image_size_for_crop(
+                img_width, img_height, window_size
+            )
+            center_list, (x_num, y_num) = self.slide_window(
+                img_width,
+                img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            # Number of rows, minus one when the last row is complete.
+            # NOTE(review): this appears to equal the number of True entries
+            # in the mask returned by `__call__` - confirm.
+            full_rows = (len(center_list) - 1) // x_num + 1
+            if len(center_list) > 0 and len(center_list) % x_num == 0:
+                full_rows -= 1
+            return len(center_list), full_rows
+
+    def __call__(
+        self, img: Image.Image
+    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
+        """Preprocess *img* and cut it into window patches.
+
+        Returns `(image, patches, mask)`: the padded/resized image, the
+        cropped patches, and a per-patch boolean mask that is True for a patch
+        ending a row (except the very last patch). `patches` is empty and
+        `mask` is `None` when no patching takes place.
+        """
+        img_width, img_height = img.size
+        new_img_width, new_img_height = self.get_image_size_for_padding(
+            img_width, img_height
+        )
+        if new_img_width != img_width or new_img_height != img_height:
+            img = self.square_pad(img)
+            img_width, img_height = img.size
+
+        new_img_width, new_img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
+        window_size = self.determine_window_size(
+            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+        )
+
+        if window_size == 0 or not self.enable_patch:
+            return img, [], None
+        else:
+            new_img_width, new_img_height = self.get_image_size_for_crop(
+                new_img_width, new_img_height, window_size
+            )
+            # NOTE(review): `img_width`/`img_height` still hold the size from
+            # before the resize above, not the current size of `img`.
+            if (new_img_width, new_img_height) != (img_width, img_height):
+                img_for_crop = img.resize(
+                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                )
+            else:
+                img_for_crop = img
+
+            patches = []
+            newlines = []
+            center_list, (x_num, y_num) = self.slide_window(
+                new_img_width,
+                new_img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            for patch_id, center_lf_point in enumerate(center_list):
+                x, y, patch_w, patch_h = center_lf_point
+                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
+                patches.append(big_patch)
+                # Remember the index of every patch that closes a row.
+                if (patch_id + 1) % x_num == 0:
+                    newlines.append(patch_id)
+
+            # No row break is recorded after the final patch.
+            if newlines and newlines[-1] == len(patches) - 1:
+                newlines.pop()
+
+            return (
+                img,
+                patches,
+                [i in newlines for i in range(len(patches))]
+                if len(patches) > 0
+                else None,
+            )
+
+
+class Step3VLProcessor:
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+    ) -> None:
+        """Set up the image preprocessor, the patcher and placeholder strings.
+
+        Args:
+            config: Model config; `config.vision_config` is read for the
+                optional `enable_patch` switch.
+            tokenizer: Tokenizer used to tokenize text and to look up the ids
+                of the special image tokens.
+        """
+        super().__init__()
+
+        self.config, self.tokenizer = config, tokenizer
+
+        # Sizes handed to the vision preprocessor for global images / patches.
+        self.image_size, self.patch_size = 728, 504
+        self.image_preprocessor = Step3VisionProcessor(
+            self.image_size, "bilinear", self.patch_size
+        )
+
+        # Number of `<im_patch>` tokens a global image / a patch expands to.
+        self.num_image_feature_size, self.num_patch_feature_size = 169, 81
+        self.image_token = "<im_patch>"
+        self.image_feature_placeholder = self.num_image_feature_size * self.image_token
+        self.patch_feature_placeholder = self.num_patch_feature_size * self.image_token
+
+        # Respect vision config switch to enable/disable patch extraction.
+        # For video understanding, it's preferable to disable patch.
+        self.patcher = ImagePatcher(
+            enable_patch=getattr(self.config.vision_config, "enable_patch", True)
+        )
+
+    @property
+    def image_token_id(self) -> int:
+        """Id of `self.image_token` in the tokenizer's vocabulary."""
+        vocab = self.tokenizer.get_vocab()
+        return vocab[self.image_token]
+
+    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
+        """Return how many tokens one image of the given size expands to.
+
+        The total is made of the feature tokens plus two delimiter tokens for
+        every patch and for the global image, plus one token per patch
+        newline reported by the patcher.
+        """
+        patch_count, newline_count = self.patcher.get_num_patches(
+            img_width, img_height
+        )
+        tokens_per_patch = self.num_patch_feature_size + 2
+        tokens_for_image = self.num_image_feature_size + 2
+        return patch_count * tokens_per_patch + tokens_for_image + newline_count
+
+    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
+        """Run the patcher on every image and collect the results in order."""
+        return [self.patcher(image) for image in images]
+
+    def _convert_images_to_pixel_values(
+        self,
+        images: list[Image.Image],
+        is_patch: bool = False,
+    ) -> list[torch.Tensor]:
+        """Preprocess each image and return its `pixel_values` entry.
+
+        Args:
+            images: Images to preprocess.
+            is_patch: Forwarded to the image preprocessor for every image.
+
+        Returns:
+            One `pixel_values` result per input image, in input order.
+        """
+        pixel_values = []
+        for image in images:
+            processed = self.image_preprocessor(image, is_patch=is_patch)
+            pixel_values.append(processed["pixel_values"])
+        return pixel_values
+
+    def _get_patch_repl(
+        self,
+        num_patches: int,
+        patch_newline_mask: list[bool] | None,
+    ) -> tuple[str, list[int]]:
+        """Build the replacement text and token ids for an image's patches.
+
+        Each patch becomes `<patch_start>`, `num_patch_feature_size` image
+        tokens and `<patch_end>`; a `<patch_newline>` follows every patch whose
+        mask entry is true.
+
+        Args:
+            num_patches: Number of patches to emit.
+            patch_newline_mask: One flag per patch; must have exactly
+                `num_patches` entries when `num_patches` is positive.
+
+        Returns:
+            `(text, token_ids)` for all patches; `("", [])` when there are no
+            patches.
+
+        Raises:
+            AssertionError: If patches are requested but the mask is missing
+                or has the wrong length.
+        """
+        if num_patches <= 0:
+            return "", []
+
+        # Same check that used to be repeated for every patch; it does not
+        # depend on the loop variable, so it is evaluated once.
+        assert (
+            patch_newline_mask is not None
+            and len(patch_newline_mask) == num_patches
+        )
+
+        # Loop-invariant pieces: resolve the ids once instead of once per
+        # patch (`image_token_id` fetches the tokenizer vocabulary each time).
+        to_id = self.tokenizer.convert_tokens_to_ids
+        start_id = to_id("<patch_start>")
+        feature_ids = [self.image_token_id] * self.num_patch_feature_size
+        end_id = to_id("<patch_end>")
+        patch_ids = [start_id, *feature_ids, end_id]
+        patch_text = f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
+
+        text_parts = []
+        token_ids = []
+        newline_id = None
+        for i in range(num_patches):
+            text_parts.append(patch_text)
+            token_ids.extend(patch_ids)
+            if patch_newline_mask and patch_newline_mask[i]:
+                # Looked up lazily so that it is only resolved when a newline
+                # is actually emitted, as before.
+                if newline_id is None:
+                    newline_id = to_id("<patch_newline>")
+                text_parts.append("<patch_newline>")
+                token_ids.append(newline_id)
+        return "".join(text_parts), token_ids
+
+    def _get_image_repl(
+        self,
+        num_images: int,
+    ) -> tuple[str, list[int]]:
+        """Build the replacement text and token ids for global images.
+
+        One image is `<im_start>`, `num_image_feature_size` image tokens and
+        `<im_end>`; the result repeats that unit `num_images` times.
+        """
+        single_text = f"<im_start>{self.image_feature_placeholder}<im_end>"
+
+        start_id = self.tokenizer.convert_tokens_to_ids("<im_start>")
+        feature_ids = [self.image_token_id] * self.num_image_feature_size
+        end_id = self.tokenizer.convert_tokens_to_ids("<im_end>")
+        single_ids = [start_id, *feature_ids, end_id]
+
+        return single_text * num_images, single_ids * num_images
+
+    def _get_image_repl_features(
+        self,
+        num_images: int,
+        num_patches: int,
+        patch_new_line_idx: list[bool] | None,
+    ) -> tuple[str, list[int]]:
+        """Return the full replacement for an image: patches, then the image.
+
+        The patch part is only built when `num_patches` is positive; the
+        global-image part always follows it.
+        """
+        patch_text, patch_ids = "", []
+        if num_patches > 0:
+            patch_text, patch_ids = self._get_patch_repl(
+                num_patches, patch_new_line_idx
+            )
+        image_text, image_ids = self._get_image_repl(num_images)
+        return patch_text + image_text, patch_ids + image_ids
+
+    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
+        """Replace the i-th occurrence of `placeholder` with `repls[i]`.
+
+        Raises:
+            ValueError: If `text` does not contain exactly `len(repls)`
+                occurrences of `placeholder`.
+        """
+        segments = text.split(placeholder)
+        if len(repls) != len(segments) - 1:
+            raise ValueError(
+                "The number of placeholders does not match the number of replacements."
+            )
+
+        # Interleave: segment, replacement, segment, ..., final segment.
+        pieces = [segments[0]]
+        for repl, following in zip(repls, segments[1:]):
+            pieces += (repl, following)
+        return "".join(pieces)
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+    ) -> BatchFeature:
+        """Tokenize `text` and preprocess `images` into one `BatchFeature`.
+
+        When images are given, the occurrences of `self.image_token` in each
+        text are replaced, in order, by the expanded placeholder string of the
+        corresponding image before tokenization.
+
+        Args:
+            text: A prompt or list of prompts; `None` means no prompts.
+            images: An image or list of images; `None` means no images.
+            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.
+
+        Returns:
+            A `BatchFeature` holding the tokenizer outputs and, when images
+            are given, `pixel_values`, `num_patches`, `patch_pixel_values`
+            and `patch_newline_mask`.
+
+        Raises:
+            ValueError: If images are given and a text does not contain
+                exactly one `self.image_token` per image.
+        """
+        # Normalize both inputs to lists.
+        if text is None:
+            text = []
+        elif not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        elif not isinstance(images, list):
+            images = [images]
+
+        if not images:
+            # Text-only input: nothing to expand, just tokenize.
+            text_inputs = self.tokenizer(text)
+            return BatchFeature({**text_inputs}, tensor_type=return_tensors)
+
+        global_pixels = []
+        patch_pixels = []
+        newline_flags = []
+        repl_strs = []
+        patches_per_image = []
+        for whole_img, crops, crop_newlines in self._split_images(images):
+            global_pixels += self._convert_images_to_pixel_values([whole_img])
+            if crops:
+                patch_pixels += self._convert_images_to_pixel_values(
+                    crops, is_patch=True
+                )
+            patches_per_image.append(len(crops))
+
+            # Only the text form is needed here; the tokenizer produces the
+            # ids from the expanded prompt below.
+            repl_str, _ = self._get_image_repl_features(1, len(crops), crop_newlines)
+            repl_strs.append(repl_str)
+
+            if crop_newlines is not None:
+                newline_flags += crop_newlines
+
+        pixel_values = torch.cat(global_pixels)
+        if patch_pixels:
+            patch_pixel_values = torch.cat(patch_pixels)
+        else:
+            # No image produced patches: emit an empty patch batch instead.
+            side = self.patch_size
+            patch_pixel_values = pixel_values.new_empty((0, 3, side, side))
+        image_inputs = {
+            "pixel_values": pixel_values,
+            "num_patches": patches_per_image,
+            "patch_pixel_values": patch_pixel_values,
+            "patch_newline_mask": torch.tensor(newline_flags, dtype=torch.bool),
+        }
+
+        expanded = [
+            self.replace_placeholder(prompt, self.image_token, repl_strs)
+            for prompt in text
+        ]
+        text_inputs = self.tokenizer(expanded)
+
+        return BatchFeature(
+            {**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )