[Misc] Cleanup more configs and processors (#37560)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Misc] Cleanup more configs and processors (#37560)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
657855ab · Cyrus Leung · GitHub · e27b8ba3 · 657855ab · 657855ab
Unverified Commit 657855ab authored Mar 19, 2026 by Cyrus Leung Committed by GitHub Mar 19, 2026
20 changed files
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -520,8 +520,10 @@ class SpeculativeConfig:

                # Replace hf_config for EAGLE draft_model
                if self.method in ("eagle", "eagle3"):
-                    from vllm.transformers_utils.configs import SpeculatorsConfig
                    from vllm.transformers_utils.configs.eagle import EAGLEConfig
+                    from vllm.transformers_utils.configs.speculators import (
+                        SpeculatorsConfig,
+                    )

                    if isinstance(
                        self.draft_model_config.hf_config,

--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import ChatGLMConfig
+from vllm.transformers_utils.configs.chatglm import ChatGLMConfig

 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (

--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -54,7 +54,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import RWConfig
+from vllm.transformers_utils.configs.falcon import RWConfig

 from .interfaces import SupportsPP
 from .utils import (

--- a/vllm/model_executor/models/flex_olmo.py
+++ b/vllm/model_executor/models/flex_olmo.py
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.models.olmoe import OlmoeAttention, OlmoeForCausalLM
-from vllm.transformers_utils.configs import FlexOlmoConfig
+from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig

 logger = init_logger(__name__)


--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -20,7 +20,6 @@ from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalFieldConfig,
@@ -31,7 +30,6 @@ from vllm.multimodal.processing import (
    BaseDummyInputsBuilder,
    BaseMultiModalProcessor,
    BaseProcessingInfo,
-    InputProcessingContext,
    PromptReplacement,
    PromptUpdate,
 )
@@ -336,28 +334,6 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
        return fields


-def _build_hcxvision_hf_info(
-    ctx: InputProcessingContext,
-) -> HCXVisionProcessingInfo:
-    return HCXVisionProcessingInfo(ctx)
-
-
-def _build_hcxvision_hf_processor(
-    info: HCXVisionProcessingInfo,
-    dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo],
-    *,
-    cache: BaseMultiModalProcessorCache | None = None,
-) -> BaseMultiModalProcessor:
-    if isinstance(info, HCXVisionProcessingInfo):
-        return HCXVisionMultiModalProcessor(
-            info,
-            dummy_inputs,  # type: ignore
-            cache=cache,
-        )
-
-    raise NotImplementedError(type(info))
-
-
 def init_vision_tower_for_hcxvision(
    vision_config,
    quant_config: QuantizationConfig | None,
@@ -587,8 +563,8 @@ class HCXVisionCAbstractor(nn.Module):


 @MULTIMODAL_REGISTRY.register_processor(
-    _build_hcxvision_hf_processor,
-    info=_build_hcxvision_hf_info,
+    HCXVisionMultiModalProcessor,
+    info=HCXVisionProcessingInfo,
    dummy_inputs=HCXVisionDummyInputsBuilder,
 )
 class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):

--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -2,22 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations

-import math
 from collections.abc import Iterable, Iterator, Mapping, Sequence
 from typing import Annotated, Any

 import numpy as np
-import PIL.Image
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 from transformers.image_processing_utils import BatchFeature
-from transformers.utils import TensorType
-from typing_extensions import TypedDict, Unpack

-from vllm.config import VllmConfig
-from vllm.config.model import ModelConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
@@ -64,13 +59,17 @@ from vllm.multimodal.processing import (
    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import get_tokenizer
-from vllm.tokenizers.hf import get_cached_tokenizer
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.transformers_utils.config import patch_rope_parameters
-from vllm.transformers_utils.configs import (
+from vllm.transformers_utils.configs.isaac import (
    IsaacConfig,
    PixelShuffleSiglip2VisionConfig,
 )
+from vllm.transformers_utils.processors.isaac import (
+    IsaacImageProcessor,
+    IsaacProcessor,
+    get_image_size_for_max_num_patches,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .vision import is_vit_use_data_parallel
@@ -307,467 +306,6 @@ def pixel_shuffle_varlen(
 # Configuration
 # ============================================================================

-MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px
-
-# Vision preprocessing constants
-VISION_MEAN = (0.5, 0.5, 0.5)
-VISION_STD = (0.5, 0.5, 0.5)
-VISION_SCALE = 1 / 255
-
-
-def _make_writeable(arr: np.ndarray) -> np.ndarray:
-    """Return *arr* itself if it is already writeable, otherwise try to flip the
-    write flag in-place and finally fall back to `arr.copy()`.
-    This guarantees the buffer handed to `torch.from_numpy()` is always
-    writeable, silencing the PyTorch warning about undefined behaviour.
-    """
-    if arr.flags.writeable:
-        return arr
-
-    # First, try the cheap path — in-place flag toggle (works for mmap'd arrays
-    # and some shared memory buffers):
-    try:
-        arr.setflags(write=True)
-        return arr  # success: no data copy
-    except ValueError:
-        # Buffer is inherently read-only (e.g. backed by PyAV / PIL): make copy
-        return arr.copy()
-
-
-def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
-    if image.width * image.height > MAX_PIXELS:
-        raise ValueError(
-            f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
-        )
-    img = image if image.mode == "RGB" else image.convert("RGB")
-    arr = np.asarray(img)
-    arr = _make_writeable(arr)
-    return torch.from_numpy(arr)
-
-
-def get_image_size_for_max_num_patches(
-    image_height: int,
-    image_width: int,
-    patch_size: int,
-    max_num_patches: int,
-    min_num_patches: int | None = None,
-    eps: float = 1e-5,
-    pixel_shuffle_scale: int = 1,
-) -> tuple[int, int]:
-    r"""Compute a target resolution whose patch grid satisfies patching parametrization.
-
-    Args:
-        image_height (`int`):
-            Height in pixels of the source image prior to any resizing.
-        image_width (`int`):
-            Width in pixels of the source image prior to any resizing.
-        patch_size (`int`):
-            Size of the square patch used by the vision encoder.
-        max_num_patches (`int`):
-            Upper bound on `(height / patch_size) * (width / patch_size)` after
-            resizing.
-        min_num_patches (`int`, *optional*):
-            Lower bound on the number of patches. When provided the image will
-            be scaled up if necessary.
-        eps (`float`, *optional*, defaults to 1e-5):
-            Convergence tolerance for the internal binary search to determine
-            the target dimensions.
-        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
-            Additional stride multiplier applied when pixel shuffle later
-            reduces spatial resolution.
-
-    Returns:
-        `tuple[int, int]`: Height and width (in pixels) that are multiples of
-        `patch_size * pixel_shuffle_scale` and respect both the maximum and
-        optional minimum patch-count constraints.
-    """
-
-    def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale):
-        scaled_size = scale * original_size
-        divisor = patch_size * pixel_shuffle_scale
-        scaled_size = math.ceil(scaled_size / divisor) * divisor
-        scaled_size = max(divisor, scaled_size)
-        return int(scaled_size)
-
-    # Ensure divisibility
-    divisor = patch_size * pixel_shuffle_scale
-    adjusted_height = math.ceil(image_height / divisor) * divisor
-    adjusted_height = max(divisor, adjusted_height)
-    adjusted_width = math.ceil(image_width / divisor) * divisor
-    adjusted_width = max(divisor, adjusted_width)
-
-    num_patches = (adjusted_height / patch_size) * (adjusted_width / patch_size)
-
-    if min_num_patches is not None and num_patches < min_num_patches:
-        # Scale up
-        scale_min, scale_max = 1.0, 100.0
-        while (scale_max - scale_min) >= eps:
-            scale = (scale_min + scale_max) / 2
-            target_height = get_scaled_image_size(
-                scale, image_height, patch_size, pixel_shuffle_scale
-            )
-            target_width = get_scaled_image_size(
-                scale, image_width, patch_size, pixel_shuffle_scale
-            )
-            num_patches = (target_height / patch_size) * (target_width / patch_size)
-            if num_patches >= min_num_patches:
-                scale_max = scale
-            else:
-                scale_min = scale
-        scale = scale_max
-        target_height = get_scaled_image_size(
-            scale, image_height, patch_size, pixel_shuffle_scale
-        )
-        target_width = get_scaled_image_size(
-            scale, image_width, patch_size, pixel_shuffle_scale
-        )
-        return target_height, target_width
-    elif num_patches <= max_num_patches:
-        return adjusted_height, adjusted_width
-    else:
-        # Scale down
-        scale_min, scale_max = eps / 10, 1.0
-        while (scale_max - scale_min) >= eps:
-            scale = (scale_min + scale_max) / 2
-            target_height = get_scaled_image_size(
-                scale, image_height, patch_size, pixel_shuffle_scale
-            )
-            target_width = get_scaled_image_size(
-                scale, image_width, patch_size, pixel_shuffle_scale
-            )
-            num_patches = (target_height / patch_size) * (target_width / patch_size)
-            if num_patches <= max_num_patches:
-                scale_min = scale
-            else:
-                scale_max = scale
-        scale = scale_min
-        target_height = get_scaled_image_size(
-            scale, image_height, patch_size, pixel_shuffle_scale
-        )
-        target_width = get_scaled_image_size(
-            scale, image_width, patch_size, pixel_shuffle_scale
-        )
-        return target_height, target_width
-
-
-_MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1)
-_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1)
-
-
-def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int:
-    tokenizer_name = model_config.tokenizer or model_config.model
-    tokenizer = get_cached_tokenizer(
-        get_tokenizer(
-            tokenizer_name,
-            tokenizer_mode=model_config.tokenizer_mode,
-            trust_remote_code=model_config.trust_remote_code,
-            revision=model_config.tokenizer_revision or model_config.revision,
-        )
-    )
-    return tokenizer.encode(vision_token, add_special_tokens=False)[0]
-
-
-def prepare_image_tensor(
-    image: torch.Tensor,
-    scale: float = VISION_SCALE,
-) -> torch.Tensor:
-    r"""Standardize RGB images prior to patch extraction via rescaling and whitening.
-
-    Args:
-        image (`torch.Tensor`):
-            Tensor with shape `(..., height, width, 3)` containing RGB values.
-            The tensor is converted to floating point if needed.
-        scale (`float`, *optional*, defaults to `VISION_SCALE`):
-            Scalar multiplier applied before normalization.
-    Returns:
-        `torch.Tensor`: Normalized tensor with the same shape as the input and
-        dtype `torch.float32`.
-    """
-    if not torch.is_floating_point(image):
-        image = image.float()
-    rescaled = image * scale
-
-    # Use precomputed tensors and move to the correct device if needed
-    mean_tensor = _MEAN_TENSOR.to(image.device)
-    std_tensor = _STD_TENSOR.to(image.device)
-
-    normalized = (rescaled - mean_tensor) / std_tensor
-    return normalized
-
-
-def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor:
-    r"""Convert normalized images into flattened ViT-style patches.
-
-    Args:
-        image (`torch.Tensor`):
-            Tensor of shape `(num_images, height, width, channels)`.
-        patch_size (`int`):
-            Edge length of the square patches
-
-    Returns:
-        `torch.Tensor`:
-            Patch tensor where each position stores the flattened pixels
-            belonging to that patch.
-
-    Raises:
-        ValueError: If `height` or `width` is not divisible by `patch_size`.
-    """
-    num_images, height, width, channels = image.shape
-    if height % patch_size or width % patch_size:
-        raise ValueError(
-            "Dimensions of images "
-            f"{image.shape} are not divisible by patch_size={patch_size}."
-        )
-    patches = image.reshape(
-        num_images,
-        height // patch_size,
-        patch_size,
-        width // patch_size,
-        patch_size,
-        channels,
-    )
-    patches = patches.permute(0, 1, 3, 2, 4, 5)
-    patches = patches.reshape(
-        num_images,
-        height // patch_size,
-        width // patch_size,
-        channels * patch_size * patch_size,
-    )
-    return patches
-
-
-def process_vision_for_patches(
-    images: torch.Tensor,
-    patch_size: int,
-    max_num_patches: int,
-    min_num_patches: int | None = None,
-    pixel_shuffle_scale: int = 1,
-) -> tuple[torch.Tensor, list[int]]:
-    r"""Resize, normalize, and patchify RGB images for the vision encoder.
-
-    Args:
-        images (`torch.Tensor`):
-            Either `(height, width, channels)` for a single image or
-            `(num_images, height, width, channels)` for a batch. Channels are
-            expected to be RGB.
-        patch_size (`int`):
-            Edge length of square patches; implicitly controls resize grid granularity.
-        max_num_patches (`int`):
-            Maximum number of patches allowed after resizing.
-        min_num_patches (`int`, *optional*):
-            Minimum number of patches. If provided, the routine upsamples images
-            as needed to satisfy the lower bound.
-        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
-            Pixel shuffle scale factor; influences the target grid that the
-            function produces.
-
-    Returns:
-        `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)`
-        where `patches` has shape `(num_images, target_h / patch_size, target_w
-        / patch_size, channels * patch_size**2)` and `dims_virtual` encodes
-        effective `(images, height, width)` dimensions after optional pixel
-        shuffling.
-    """
-    # Add batch dim if single image
-    if images.dim() == 3:
-        images = images.unsqueeze(0)
-
-    # Permute to channel first for resize
-    images = images.permute(0, 3, 1, 2)
-
-    # Get target dimensions
-    _, _, orig_height, orig_width = images.shape
-    target_height, target_width = get_image_size_for_max_num_patches(
-        orig_height,
-        orig_width,
-        patch_size,
-        max_num_patches,
-        min_num_patches=min_num_patches,
-        pixel_shuffle_scale=pixel_shuffle_scale,
-    )
-
-    # Resize
-    images = F.interpolate(
-        images,
-        size=(target_height, target_width),
-        mode="bilinear",
-        align_corners=False,
-    )
-
-    # Back to channel last
-    images = images.permute(0, 2, 3, 1)
-
-    # Normalize
-    images = prepare_image_tensor(images)
-
-    # Patchify
-    patches = patchify_vision(images, patch_size=patch_size)
-
-    # Calculate dimensions for the patches
-    n_images, h_patches, w_patches, _ = patches.shape
-    dims_virtual = (
-        [1, h_patches, w_patches]
-        if pixel_shuffle_scale == 1
-        else [1, h_patches // pixel_shuffle_scale, w_patches // pixel_shuffle_scale]
-    )
-
-    return patches, dims_virtual
-
-
-class IsaacImageProcessorKwargs(TypedDict, total=False):
-    patch_size: int
-    max_num_patches: int
-    min_num_patches: int
-    pixel_shuffle_scale: int
-
-
-class IsaacImageProcessor:
-    patch_size = 16
-    max_num_patches = 6144
-    min_num_patches = 256
-    pixel_shuffle_scale = 2
-
-    valid_kwargs = IsaacImageProcessorKwargs
-    model_input_names = ["pixel_values", "image_grid_thw"]
-
-    def __init__(self, kwargs):
-        self.patch_size = kwargs.pop("patch_size", self.patch_size)
-        self.vision_max_num_patches = kwargs.pop(
-            "vision_max_num_patches", self.max_num_patches
-        )
-        self.vision_min_num_patches = kwargs.pop(
-            "vision_min_num_patches", self.min_num_patches
-        )
-        self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2)
-
-    def preprocess(
-        self,
-        images: list[torch.Tensor],
-        return_tensors: str | TensorType | None,
-        **kwargs: Unpack[IsaacImageProcessorKwargs],
-    ) -> BatchFeature:
-        """Preprocess images into format compatible with vLLM input processing."""
-
-        all_pixel_values: list[torch.Tensor] = []
-        all_image_grids: list[torch.Tensor] = []
-
-        for image in images:
-            image_tensor = extract_image_pil(image)
-
-            patches, dims_virtual = process_vision_for_patches(
-                image_tensor,
-                patch_size=self.patch_size,
-                max_num_patches=self.vision_max_num_patches,
-                min_num_patches=self.vision_min_num_patches,
-                pixel_shuffle_scale=self.pixel_shuffle_scale,
-            )
-
-            # Isaac packs a dummy temporal dim for images
-            patches = patches.unsqueeze(1)  # [N, T=1, Hp, Wp, D]
-
-            hp, wp, dim = patches.shape[-3], patches.shape[-2], patches.shape[-1]
-            current_num_patches = hp * wp
-            pixel_values = patches.reshape(current_num_patches, dim)  # [N_tokens, D]
-
-            # Use real patch dimensions for image_grid_thw, not virtual dimensions
-            # This ensures the vision model receives correct grid info for pixel shuffle
-            dims_real = [1, hp, wp]  # Real patch dimensions
-            image_grid_thw = torch.tensor(dims_real).unsqueeze(0)
-
-            all_pixel_values.append(pixel_values)
-            all_image_grids.append(image_grid_thw)
-
-        if all_pixel_values:
-            final_pixel_values = torch.cat(all_pixel_values, dim=0)
-            final_image_grids = torch.cat(all_image_grids, dim=0)
-        else:
-            final_pixel_values = torch.empty(0, 0)
-            final_image_grids = torch.empty(0, 3)
-
-        return BatchFeature(
-            data={
-                "pixel_values": final_pixel_values,
-                "image_grid_thw": final_image_grids,
-            },
-            tensor_type=return_tensors,
-        )
-
-
-class IsaacProcessor:
-    """Processor wrapper (tokenizer + IsaacImageProcessor)."""
-
-    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        self.image_token = kwargs.pop("image_token", "<image>")
-        self.image_processor = image_processor or IsaacImageProcessor(kwargs)
-        self.tokenizer = tokenizer
-
-    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
-        result = {}
-
-        if images is not None:
-            image_inputs = self.image_processor.preprocess(images, **kwargs)
-            image_grid_thw = image_inputs["image_grid_thw"]
-            result.update(image_inputs)
-
-            if text is not None:
-                if not isinstance(text, list):
-                    text = [text]
-
-                text = text.copy()  # below lines change text in-place
-                merge_length = self.image_processor.pixel_shuffle_scale**2
-                index = 0
-                for i in range(len(text)):
-                    while self.image_token in text[i]:
-                        num_image_tokens = image_grid_thw[index].prod() // merge_length
-                        text[i] = text[i].replace(
-                            self.image_token, "<|placeholder|>" * num_image_tokens, 1
-                        )
-                        index += 1
-                    text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
-
-        if text is not None:
-            result.update(self.tokenizer(text, **kwargs))
-
-        return BatchFeature(result)
-
-    def apply_chat_template(
-        self,
-        messages: list[dict[str, Any]],
-        tokenize: bool = False,
-        add_generation_prompt: bool = False,
-        **kwargs,
-    ) -> Any:
-        # Convert mixed content messages to simple text format
-        processed_messages = []
-
-        for message in messages:
-            if "content" in message and isinstance(message["content"], list):
-                # Handle mixed content (text + image)
-                text_parts = []
-                for content_item in message["content"]:
-                    if content_item.get("type") == "text":
-                        text_parts.append(content_item.get("text", ""))
-                    elif content_item.get("type") == "image":
-                        # Replace image with vision token
-                        text_parts.append(self.image_token)
-
-                processed_message = {
-                    "role": message.get("role", "user"),
-                    "content": "".join(text_parts),
-                }
-                processed_messages.append(processed_message)
-            else:
-                # Regular text message
-                processed_messages.append(message)
-
-        kwargs["return_dict"] = False
-        return self.tokenizer.apply_chat_template(
-            processed_messages,
-            tokenize=tokenize,
-            add_generation_prompt=add_generation_prompt,
-            **kwargs,
-        )
-

 class IsaacProcessingInfo(BaseProcessingInfo):
    def get_hf_config(self) -> IsaacConfig:
@@ -795,16 +333,18 @@ class IsaacProcessingInfo(BaseProcessingInfo):
            )
        return IsaacConfig()

+    def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
+        return IsaacImageProcessor(kwargs)
+
    def get_hf_processor(self, **kwargs) -> IsaacProcessor:
        hf_config = self.get_hf_config()
-        processor_kwargs = {
-            "image_token": hf_config.vision_token,
-        }
-        processor_kwargs.update(kwargs)
-        return self.ctx.get_hf_processor(IsaacProcessor, **processor_kwargs)

-    def get_tokenizer(self):
-        return self.ctx.tokenizer
+        return self.ctx.init_processor(
+            IsaacProcessor,
+            tokenizer=self.get_tokenizer(),
+            image_processor=self.get_image_processor(),
+            image_token=hf_config.vision_token,
+        )

    def get_image_size_with_most_features(self) -> ImageSize:
        hf_config = self.get_hf_config()
@@ -819,9 +359,6 @@ class IsaacProcessingInfo(BaseProcessingInfo):
        )
        return ImageSize(width=target_width, height=target_height)

-    def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
-        return self.get_hf_processor(**kwargs).image_processor
-
    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": None}

@@ -1206,6 +743,12 @@ class Siglip2VisionTransformer(nn.Module):
        return loaded_params


+def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int:
+    tokenizer = cached_tokenizer_from_config(model_config)
+    assert tokenizer is not None
+    return tokenizer.encode(vision_token, add_special_tokens=False)[0]
+
+
 class IsaacVisionEmbedding(nn.Module):
    def __init__(
        self,

--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -49,7 +49,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import JAISConfig
+from vllm.transformers_utils.configs.jais import JAISConfig

 from .interfaces import SupportsPP
 from .utils import (

--- a/vllm/model_executor/models/kimi_k25.py
+++ b/vllm/model_executor/models/kimi_k25.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: E501
 """
 Kimi-K2.5 Model Implementation for vLLM.

-Kimi-K2.5 extends Kimi-K2 with vision support
-
-This module defines:
- KimiK25ProcessingInfo/KimiK25MultiModalProcessor: Processing logic
- KimiK25ForConditionalGeneration: Main model class
+Kimi-K2.5 extends Kimi-K2 with vision support.
 """

 from collections.abc import Iterable, Mapping, Sequence
@@ -18,14 +13,13 @@ from typing import Annotated, Any, Literal
 import torch
 from torch import nn
 from transformers import BatchFeature
-from transformers.processing_utils import ProcessorMixin

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
-    CompressedTensorsConfig,
+from vllm.model_executor.layers.quantization.compressed_tensors import (
+    compressed_tensors,
 )
 from vllm.model_executor.models.interfaces import (
    SupportsEagle,
@@ -45,7 +39,6 @@ from vllm.multimodal.inputs import (
    MultiModalFieldConfig,
    MultiModalKwargsItems,
    NestedTensors,
-    VisionChunk,
    VisionChunkImage,
    VisionChunkVideo,
 )
@@ -60,8 +53,9 @@ from vllm.multimodal.processing import (
 )
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import KimiK25Config
+from vllm.transformers_utils.configs.kimi_k25 import KimiK25Config
 from vllm.transformers_utils.processor import cached_get_image_processor
+from vllm.transformers_utils.processors.kimi_k25 import KimiK25Processor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .utils import (
@@ -101,69 +95,6 @@ class KimiK25MediaPixelInputs(TensorSchema):
    grid_thws: Annotated[torch.Tensor, TensorShape("nm", 3)]


-class MoonshotKimiVAutoProcessor(ProcessorMixin):
-    attributes = ["tokenizer"]
-    tokenizer_class = "AutoTokenizer"
-
-    def __init__(
-        self, media_processor=None, tokenizer=None, media_token_id: int | None = None
-    ):
-        super().__init__(tokenizer)
-        self.media_processor = media_processor
-        self.media_token_id = media_token_id
-        assert self.media_token_id is not None
-
-    # We do not support str input for text here
-    def __call__(
-        self,
-        vision_chunks: list[VisionChunk] | None = None,
-        *,
-        text: list[int] | str,
-        **kwargs,
-    ) -> BatchFeature:
-        """
-        Args:
-            vision_chunks: List of VisionChunk items to be processed.
-                For image: VisionChunkImage with type='image', image=PIL.Image
-                For video_chunk: VisionChunkVideo with type='video_chunk', video_chunk=list[PIL.Image]
-            text: The token ids to be fed to a model (required).
-        Returns:
-            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
-            - **input_ids** -- list of token ids to be fed to a model.
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `vision_chunks` is not `None`.
-            - **grid_thws** -- list of image 3D grid in LLM. Returned when `vision_chunks` is not `None`.
-        """
-        mm_inputs = {}
-        input_ids = self.tokenizer.encode(text) if isinstance(text, str) else text
-        if vision_chunks is not None:
-            assert isinstance(vision_chunks, list)
-            mm_inputs = self.media_processor.preprocess(vision_chunks)
-
-            num_tokens_per_chunk = [
-                self.media_processor.media_tokens_calculator(chunk)
-                for chunk in vision_chunks
-            ]
-
-            new_input_ids = []
-            for token in input_ids:
-                if token == self.media_token_id:
-                    new_input_ids.extend(
-                        [self.media_token_id] * num_tokens_per_chunk.pop(0)
-                    )
-                else:
-                    new_input_ids.append(token)
-            input_ids = new_input_ids
-
-        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
-        return BatchFeature(
-            data={
-                "input_ids": torch.tensor([input_ids]),
-                **mm_inputs,
-            }
-        )
-
-
 class KimiK25ProcessingInfo(BaseProcessingInfo):
    """Processing information for Kimi-K2.5 model.

@@ -180,7 +111,7 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
            trust_remote_code=self.ctx.model_config.trust_remote_code,
        )
        self.media_processor = media_processor
-        self.hf_processor = MoonshotKimiVAutoProcessor(
+        self.hf_processor = KimiK25Processor(
            media_processor=self.media_processor,
            tokenizer=self.get_tokenizer(),
            media_token_id=self.media_token_id,
@@ -263,12 +194,14 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Indicates how to slice media input into multiple items.

-        pixel_values: [N, 3, patch_size, patch_size], all patches collected from B medias
-        grid_thws: [B,3], each item: [N_t, N_h ,N_w], indicates the grid size in time/height/width direction
-                    for current item.
+        pixel_values: [N, 3, patch_size, patch_size],
+          all patches collected from B medias
+        grid_thws: [B,3], each item: [N_t, N_h ,N_w],
+          indicates the grid size in time/height/width direction for current item.

-        by multiplying [N_t, N_h ,N_w], we get the number of patches for each media item, thus we can slice
-        pixel_values by pixel_values[start:start + N_t*N_h*N_w] to get patches of one item.
+        by multiplying [N_t, N_h ,N_w], we get the number of patches
+        for each media item, thus we can slice pixel_values by
+        pixel_values[start:start + N_t*N_h*N_w] to get patches of one item.

        """
        grid_thws = hf_inputs.get("grid_thws", torch.empty((0, 3)))
@@ -403,7 +336,7 @@ class KimiK25ForConditionalGeneration(
        self.media_placeholder: int = self.config.media_placeholder_token_id

    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
-        if isinstance(quant_config, CompressedTensorsConfig):
+        if isinstance(quant_config, compressed_tensors.CompressedTensorsConfig):
            return None
        return quant_config


--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -77,7 +77,7 @@ from vllm.multimodal.processing import (
    PromptUpdate,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
+from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig, MoonViTConfig
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix

--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import Lfm2MoeConfig
+from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig

 from .interfaces import (
    HasInnerState,

--- a/vllm/model_executor/models/lightonocr.py
+++ b/vllm/model_executor/models/lightonocr.py
@@ -16,8 +16,7 @@ from vllm.model_executor.models.mistral3 import (
    Mistral3ForConditionalGeneration,
    Mistral3MultiModalProjector,
    Mistral3ProcessingInfo,
-    _build_mistral3_info,
-    init_vision_tower_for_llava,
+    init_vision_tower_for_mistral3,
 )
 from vllm.model_executor.models.pixtral import PixtralHFEncoderInfo
 from vllm.model_executor.models.utils import (
@@ -27,11 +26,9 @@ from vllm.model_executor.models.utils import (
    maybe_prefix,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
 from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
-    BaseDummyInputsBuilder,
    BaseMultiModalProcessor,
    PromptReplacement,
    PromptUpdate,
@@ -128,19 +125,9 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
        ]


-def _build_LightOnOCR_processor(
-    info: _I,
-    dummy_inputs: BaseDummyInputsBuilder[_I],
-    *,
-    cache: BaseMultiModalProcessorCache | None = None,
-):
-    assert isinstance(info, Mistral3ProcessingInfo)
-    return LightOnOCRMultiModalProcessor(info, dummy_inputs, cache=cache)
-
-
 @MULTIMODAL_REGISTRY.register_processor(
-    _build_LightOnOCR_processor,
-    info=_build_mistral3_info,
+    LightOnOCRMultiModalProcessor,
+    info=Mistral3ProcessingInfo,
    dummy_inputs=Mistral3DummyInputsBuilder,
 )
 class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
@@ -164,7 +151,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
        self.multimodal_config = multimodal_config

        with self._mark_tower_model(vllm_config, "image"):
-            self.vision_tower = init_vision_tower_for_llava(
+            self.vision_tower = init_vision_tower_for_mistral3(
                config,
                quant_config=quant_config,
                require_post_norm=False,

--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Final, Literal, Protocol, TypeVar
+from typing import Annotated, Literal

 import torch
 import torch.nn as nn
-from transformers import (
-    BatchFeature,
-    Mistral3Config,
-    PixtralVisionConfig,
-    PretrainedConfig,
-)
+from transformers import BatchFeature, Mistral3Config, PixtralVisionConfig
 from transformers.models.pixtral import PixtralProcessor

 from vllm.config import VllmConfig
@@ -23,7 +17,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalFieldConfig,
@@ -34,7 +27,6 @@ from vllm.multimodal.processing import (
    BaseDummyInputsBuilder,
    BaseMultiModalProcessor,
    BaseProcessingInfo,
-    InputProcessingContext,
    PromptReplacement,
    PromptUpdate,
    PromptUpdateDetails,
@@ -178,27 +170,15 @@ class Mistral3MultiModalProjector(nn.Module):
        return hidden_states


-class LlavaLikeConfig(Protocol):
-    vision_config: Final[PretrainedConfig]
-    image_token_index: Final[int]
-    vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[int | list[int]]
-
-
-class LlavaLikeProcessor(Protocol):
-    image_token: Final[str]
-
-
-class BaseLlavaProcessingInfo(BaseProcessingInfo):
-    def get_hf_config(self) -> LlavaLikeConfig:
+class Mistral3ProcessingInfo(BaseProcessingInfo):
+    def get_hf_config(self) -> Mistral3Config:
        return self.ctx.get_hf_config(Mistral3Config)

    def get_vision_encoder_info(self):
        return get_vision_encoder_info(self.get_hf_config())

-    @abstractmethod
-    def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
-        raise NotImplementedError
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": None}
@@ -221,10 +201,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
        return ImageSize(width=width, height=height)


-_I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
-
-
-class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[Mistral3ProcessingInfo]):
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

@@ -255,11 +232,6 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
        }


-class Mistral3ProcessingInfo(BaseLlavaProcessingInfo):
-    def get_hf_processor(self, **kwargs: object):
-        return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
-
-
 class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo]):
    def _call_hf_processor(
        self,
@@ -339,29 +311,7 @@ class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo
        ]


-def _build_mistral3_info(
-    ctx: InputProcessingContext,
-) -> BaseLlavaProcessingInfo:
-    hf_config = ctx.get_hf_config(Mistral3Config)
-    assert isinstance(hf_config.vision_config, PixtralVisionConfig)
-    return Mistral3ProcessingInfo(ctx)
-
-
-def _build_mistral3_processor(
-    info: _I,
-    dummy_inputs: BaseDummyInputsBuilder[_I],
-    *,
-    cache: BaseMultiModalProcessorCache | None = None,
-) -> BaseMultiModalProcessor:
-    assert isinstance(info, Mistral3ProcessingInfo)
-    return Mistral3MultiModalProcessor(
-        info,
-        dummy_inputs,  # type: ignore
-        cache=cache,
-    )
-
-
-def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
+def _get_num_hidden_layers(hf_config: Mistral3Config) -> int:
    """Determine the number of hidden layers to initialize up to in the
    visual encoder.

@@ -381,8 +331,8 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
    )


-def init_vision_tower_for_llava(
-    hf_config: LlavaLikeConfig,
+def init_vision_tower_for_mistral3(
+    hf_config: Mistral3Config,
    quant_config: QuantizationConfig | None,
    *,
    require_post_norm: bool | None = None,
@@ -405,8 +355,8 @@ def init_vision_tower_for_llava(


 @MULTIMODAL_REGISTRY.register_processor(
-    _build_mistral3_processor,
-    info=_build_mistral3_info,
+    Mistral3MultiModalProcessor,
+    info=Mistral3ProcessingInfo,
    dummy_inputs=Mistral3DummyInputsBuilder,
 )
 class Mistral3ForConditionalGeneration(
@@ -466,7 +416,7 @@ class Mistral3ForConditionalGeneration(
            config.projector_hidden_act = "gelu"

        with self._mark_tower_model(vllm_config, "image"):
-            self.vision_tower = init_vision_tower_for_llava(
+            self.vision_tower = init_vision_tower_for_mistral3(
                config,
                quant_config=quant_config,
                require_post_norm=False,

--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -52,7 +52,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import NemotronConfig
+from vllm.transformers_utils.configs.nemotron import NemotronConfig

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (

--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -81,7 +81,7 @@ from vllm.model_executor.models.utils import (
    sequence_parallel_chunk,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import NemotronHConfig
+from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig


 class NemotronHMLP(nn.Module):

--- a/vllm/model_executor/models/nemotron_h_mtp.py
+++ b/vllm/model_executor/models/nemotron_h_mtp.py
@@ -26,7 +26,7 @@ from vllm.model_executor.models.utils import (
    maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import NemotronHConfig
+from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig

 from .interfaces import SupportsPP
 from .nemotron_h import (

--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -63,7 +63,7 @@ from vllm.model_executor.models.utils import (
    maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import Olmo3Config
+from vllm.transformers_utils.configs.olmo3 import Olmo3Config


 class Olmo2Attention(nn.Module):

--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -80,7 +80,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import Qwen3NextConfig
+from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
 from vllm.triton_utils import tl, triton
 from vllm.utils.multi_stream_utils import maybe_execute_in_parallel
 from vllm.utils.torch_utils import (

--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -25,7 +25,7 @@ from vllm.model_executor.models.qwen3_next import (
    QwenNextMixtureOfExperts,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import Qwen3NextConfig
+from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig

 from .utils import (
    AutoWeightsLoader,

--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -2,18 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from itertools import product
-from math import ceil, sqrt
+from math import sqrt
 from typing import Annotated, Any, Literal, TypeAlias

-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from PIL import Image
-from torchvision import transforms
-from torchvision.transforms.functional import InterpolationMode
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -43,8 +38,8 @@ from vllm.multimodal.processing import (
    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
-from vllm.transformers_utils.configs import Step3VisionEncoderConfig
+from vllm.transformers_utils.configs.step3_vl import Step3VisionEncoderConfig
+from vllm.transformers_utils.processors.step3_vl import Step3VLProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -89,430 +84,6 @@ class Step3VLImageEmbeddingInputs(TensorSchema):

 Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs

-ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
-
-MAX_IMAGE_SIZE: int = 3024
-
-
-class Step3VisionProcessor:
-    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
-        mean = [0.48145466, 0.4578275, 0.40821073]
-        std = [0.26862954, 0.26130258, 0.27577711]
-        patch_size = patch_size if patch_size is not None else size
-
-        self.transform = transforms.Compose(
-            [
-                transforms.ToTensor(),
-                transforms.Normalize(mean, std),
-                transforms.Resize(
-                    (size, size),
-                    interpolation=InterpolationMode.BICUBIC
-                    if interpolation_mode == "bicubic"
-                    else InterpolationMode.BILINEAR,
-                    antialias=True,
-                ),
-            ]
-        )
-
-        self.patch_transform = (
-            transforms.Compose(
-                [
-                    transforms.ToTensor(),
-                    transforms.Normalize(mean, std),
-                    transforms.Resize(
-                        (patch_size, patch_size),
-                        interpolation=InterpolationMode.BICUBIC
-                        if interpolation_mode == "bicubic"
-                        else InterpolationMode.BILINEAR,
-                        antialias=True,
-                    ),
-                ]
-            )
-            if patch_size is not None
-            else None
-        )
-
-    def __call__(self, image, is_patch=False):
-        if is_patch:
-            return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
-        else:
-            return {"pixel_values": self.transform(image).unsqueeze(0)}
-
-
-class ImagePatcher:
-    def __init__(self, enable_patch: bool = True) -> None:
-        self.enable_patch = enable_patch
-
-    def determine_window_size(self, long: int, short: int) -> int:
-        if long < 728:
-            return short if long / short > 1.5 else 0
-        return min(short, 504) if long / short > 4 else 504
-
-    def slide_window(
-        self,
-        width: int,
-        height: int,
-        sizes: list[tuple[int, int]],
-        steps: list[tuple[int, int]],
-        img_rate_thr: float = 0.6,
-    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
-        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
-        windows = []
-        # Sliding windows.
-        for size, step in zip(sizes, steps):
-            size_w, size_h = size
-            step_w, step_h = step
-
-            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
-            x_start = [step_w * i for i in range(x_num)]
-            if len(x_start) > 1 and x_start[-1] + size_w > width:
-                x_start[-1] = width - size_w
-
-            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
-            y_start = [step_h * i for i in range(y_num)]
-            if len(y_start) > 1 and y_start[-1] + size_h > height:
-                y_start[-1] = height - size_h
-
-            start = np.array(list(product(y_start, x_start)), dtype=int)
-            start[:, [0, 1]] = start[:, [1, 0]]
-            windows.append(np.concatenate([start, start + size], axis=1))
-        windows = np.concatenate(windows, axis=0)
-
-        return [
-            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
-            for box in windows
-        ], (x_num, y_num)
-
-    def square_pad(self, img: Image.Image) -> Image.Image:
-        w, h = img.size
-        if w == h:
-            return img
-        size = max(w, h)
-        padded = Image.new(img.mode, (size, size), 0)
-        padded.paste(img, (0, 0))
-        return padded
-
-    def get_image_size_for_padding(
-        self, img_width: int, img_height: int
-    ) -> tuple[int, int]:
-        ratio = img_width / img_height
-        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
-            new_size = max(img_height, img_width)
-            return new_size, new_size
-        return img_width, img_height
-
-    def get_image_size_for_preprocess(
-        self, img_width: int, img_height: int
-    ) -> tuple[int, int]:
-        if max(img_height, img_width) > MAX_IMAGE_SIZE:
-            scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
-            img_width = int(img_width * scale_factor)
-            img_height = int(img_height * scale_factor)
-        return img_width, img_height
-
-    def get_image_size_for_crop(
-        self, img_width: int, img_height: int, window_size: int
-    ):
-        w_ratio = img_width / window_size
-        h_ratio = img_height / window_size
-
-        if w_ratio < 1:
-            width_new = img_width
-        else:
-            decimal_w = w_ratio - img_width // window_size
-            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
-            width_new = window_size * w_ratio
-        if h_ratio < 1:
-            height_new = img_height
-        else:
-            decimal_h = h_ratio - img_height // window_size
-            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
-            height_new = window_size * h_ratio
-        return int(width_new), int(height_new)
-
-    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
-        target = img.crop((j, i, j + tw, i + th))
-        return target
-
-    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
-        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
-        img_width, img_height = self.get_image_size_for_preprocess(
-            img_width, img_height
-        )
-        window_size = self.determine_window_size(
-            max(img_height, img_width), min(img_height, img_width)
-        )
-        if window_size == 0 or not self.enable_patch:
-            return 0, 0
-        else:
-            img_width, img_height = self.get_image_size_for_crop(
-                img_width, img_height, window_size
-            )
-            center_list, (x_num, y_num) = self.slide_window(
-                img_width,
-                img_height,
-                [(window_size, window_size)],
-                [(window_size, window_size)],
-            )
-            full_rows = (len(center_list) - 1) // x_num + 1
-            if len(center_list) > 0 and len(center_list) % x_num == 0:
-                full_rows -= 1
-            return len(center_list), full_rows
-
-    def __call__(
-        self, img: Image.Image
-    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
-        img_width, img_height = img.size
-        new_img_width, new_img_height = self.get_image_size_for_padding(
-            img_width, img_height
-        )
-        if new_img_width != img_width or new_img_height != img_height:
-            img = self.square_pad(img)
-            img_width, img_height = img.size
-
-        new_img_width, new_img_height = self.get_image_size_for_preprocess(
-            img_width, img_height
-        )
-        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
-        window_size = self.determine_window_size(
-            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
-        )
-
-        if window_size == 0 or not self.enable_patch:
-            return img, [], None
-        else:
-            new_img_width, new_img_height = self.get_image_size_for_crop(
-                new_img_width, new_img_height, window_size
-            )
-            if (new_img_width, new_img_height) != (img_width, img_height):
-                img_for_crop = img.resize(
-                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
-                )
-            else:
-                img_for_crop = img
-
-            patches = []
-            newlines = []
-            center_list, (x_num, y_num) = self.slide_window(
-                new_img_width,
-                new_img_height,
-                [(window_size, window_size)],
-                [(window_size, window_size)],
-            )
-            for patch_id, center_lf_point in enumerate(center_list):
-                x, y, patch_w, patch_h = center_lf_point
-                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
-                patches.append(big_patch)
-                if (patch_id + 1) % x_num == 0:
-                    newlines.append(patch_id)
-
-            if newlines and newlines[-1] == len(patches) - 1:
-                newlines.pop()
-
-            return (
-                img,
-                patches,
-                [i in newlines for i in range(len(patches))]
-                if len(patches) > 0
-                else None,
-            )
-
-
-class Step3VLProcessor:
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-        self.image_size = 728
-        self.patch_size = 504
-        self.image_preprocessor = Step3VisionProcessor(
-            self.image_size, "bilinear", self.patch_size
-        )
-
-        self.num_image_feature_size = 169
-        self.num_patch_feature_size = 81
-        self.image_token = "<im_patch>"
-        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
-        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
-
-        # Respect vision config switch to enable/disable patch extraction.
-        # For video understanding, it's preferable to disable patch.
-        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
-        self.patcher = ImagePatcher(enable_patch=enable_patch)
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[self.image_token]
-
-    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
-        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
-
-        return (
-            num_patches * (self.num_patch_feature_size + 2)
-            + self.num_image_feature_size
-            + 2
-            + num_newlines
-        )
-
-    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
-        result = []
-        for img in images:
-            result.append(self.patcher(img))
-        return result
-
-    def _convert_images_to_pixel_values(
-        self,
-        images: list[Image.Image],
-        is_patch: bool = False,
-    ) -> list[torch.Tensor]:
-        return [
-            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
-            for img in images
-        ]
-
-    def _get_patch_repl(
-        self,
-        num_patches: int,
-        patch_newline_mask: list[bool] | None,
-    ) -> tuple[str, list[int]]:
-        text = ""
-        token_ids = []
-        for i in range(num_patches):
-            assert len(patch_newline_mask) == num_patches
-            text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
-            token_ids.extend(
-                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
-                + [self.image_token_id] * self.num_patch_feature_size
-                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
-            )
-            if patch_newline_mask and patch_newline_mask[i]:
-                text += "<patch_newline>"
-                token_ids.append(
-                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
-                )
-        return text, token_ids
-
-    def _get_image_repl(
-        self,
-        num_images: int,
-    ) -> tuple[str, list[int]]:
-        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
-        token_ids = (
-            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
-            + [self.image_token_id] * self.num_image_feature_size
-            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
-        )
-        return text * num_images, token_ids * num_images
-
-    def _get_image_repl_features(
-        self,
-        num_images: int,
-        num_patches: int,
-        patch_new_line_idx: list[bool] | None,
-    ) -> tuple[str, list[int]]:
-        if num_patches > 0:
-            patch_repl, patch_repl_ids = self._get_patch_repl(
-                num_patches, patch_new_line_idx
-            )
-        else:
-            patch_repl = ""
-            patch_repl_ids = []
-        image_repl, image_repl_ids = self._get_image_repl(num_images)
-        return patch_repl + image_repl, patch_repl_ids + image_repl_ids
-
-    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
-        parts = text.split(placeholder)
-
-        if len(parts) - 1 != len(repls):
-            raise ValueError(
-                "The number of placeholders does not match the number of replacements."
-            )
-
-        result = [parts[0]]
-        for i, repl in enumerate(repls):
-            result.append(repl)
-            result.append(parts[i + 1])
-
-        return "".join(result)
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
-        if len(images) == 0:
-            image_inputs = {}
-            text_inputs = self.tokenizer(text)
-        else:
-            split_images_data = self._split_images(images)
-            pixel_values_lst = []
-            patch_pixel_values_lst = []
-            patch_newline_mask_lst = []
-            image_repl_str_lst = []
-            image_repl_ids_lst = []
-            num_patches = []
-            for raw_img, img_patches, patch_newline_mask in split_images_data:
-                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
-
-                if len(img_patches) > 0:
-                    patch_pixel_values_lst.extend(
-                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
-                    )
-                num_patches.append(len(img_patches))
-
-                image_repl_str, image_repl_ids = self._get_image_repl_features(
-                    1, len(img_patches), patch_newline_mask
-                )
-                image_repl_str_lst.append(image_repl_str)
-                image_repl_ids_lst.extend(image_repl_ids)
-
-                if patch_newline_mask is not None:
-                    patch_newline_mask_lst.extend(patch_newline_mask)
-
-            pixel_values = torch.cat(pixel_values_lst)
-            patch_size = self.patch_size
-            image_inputs = {
-                "pixel_values": pixel_values,
-                "num_patches": num_patches,
-                "patch_pixel_values": (
-                    torch.cat(patch_pixel_values_lst)
-                    if patch_pixel_values_lst
-                    else pixel_values.new_empty((0, 3, patch_size, patch_size))
-                ),
-                "patch_newline_mask": torch.tensor(
-                    patch_newline_mask_lst, dtype=torch.bool
-                ),
-            }
-
-            text = [
-                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
-                for t in text
-            ]
-            text_inputs = self.tokenizer(text)
-
-        return BatchFeature(
-            {
-                **text_inputs,
-                **image_inputs,
-            },
-            tensor_type=return_tensors,
-        )
-

 class Step3VLProcessingInfo(BaseProcessingInfo):
    def get_hf_processor(self) -> Step3VLProcessor:

--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
 from vllm.multimodal.parse import (
    ImageEmbeddingItems,
@@ -34,10 +33,8 @@ from vllm.multimodal.parse import (
    MultiModalDataItems,
 )
 from vllm.multimodal.processing import (
-    BaseDummyInputsBuilder,
    BaseMultiModalProcessor,
    BaseProcessingInfo,
-    InputProcessingContext,
    PromptReplacement,
    PromptUpdate,
 )
@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
        ]


-def _build_tarsier_hf_info(ctx: InputProcessingContext) -> TarsierProcessingInfo:
-    return TarsierProcessingInfo(ctx)
-
-
-def _build_tarsier_hf_processor(
-    info: _I_Tarsier,
-    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
-    *,
-    cache: BaseMultiModalProcessorCache | None = None,
-) -> BaseMultiModalProcessor:
-    if isinstance(info, TarsierProcessingInfo):
-        return TarsierMultiModalProcessor(
-            info,
-            dummy_inputs,
-            cache=cache,
-        )
-    raise NotImplementedError(type(info))
-
-
 def init_vision_tower_for_tarsier(
    hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
    quant_config: QuantizationConfig | None,
@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier(


 @MULTIMODAL_REGISTRY.register_processor(
-    _build_tarsier_hf_processor,
-    info=_build_tarsier_hf_info,
+    TarsierMultiModalProcessor,
+    info=TarsierProcessingInfo,
    dummy_inputs=TarsierDummyInputsBuilder,
 )
 class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):