[1/2] Move InternVL-based processors (#37260)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[1/2] Move InternVL-based processors (#37260)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
f3403243 · Cyrus Leung · GitHub · 2660b928 · f3403243 · f3403243
Unverified commit f3403243, authored Mar 17, 2026 by Cyrus Leung; committed by GitHub on Mar 17, 2026
20 changed files
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
    min_num: int,
    max_num: int,
 ):
-    from vllm.model_executor.models.h2ovl import (
+    from vllm.transformers_utils.processors.h2ovl import (
        calculate_h2ovl_targets,
        get_h2ovl_target_ratios,
    )

--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
    min_num: int,
    max_num: int,
 ):
-    from vllm.model_executor.models.internvl import (
+    from vllm.transformers_utils.processors.internvl import (
        calculate_internvl_targets,
        get_internvl_target_ratios,
    )

--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
    min_num: int,
    max_num: int,
 ):
-    from vllm.model_executor.models.nemotron_vl import (
+    from vllm.transformers_utils.processors.nemotron_vl import (
        calculate_nemotron_vl_targets,
        get_nemotron_vl_target_ratios,
    )

--- a/vllm/model_executor/models/eagle2_5_vl.py
+++ b/vllm/model_executor/models/eagle2_5_vl.py
@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.processing import PromptUpdateDetails
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (
@@ -27,13 +26,9 @@ from .interfaces import (
    SupportsPP,
 )
 from .internvl import (
-    IMG_CONTEXT,
-    IMG_END,
-    IMG_START,
    BaseInternVLDummyInputsBuilder,
    BaseInternVLMultiModalProcessor,
    BaseInternVLProcessingInfo,
-    BaseInternVLProcessor,
 )
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix

@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
 )


-class Eagle2_5_VLProcessor(BaseInternVLProcessor):
-    """
-    Custom processor for Eagle2.5-VL model.
-    Extends BaseInternVLProcessor with Eagle-specific token handling.
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        # Skip super().__init__() to avoid config manipulation
-        # Directly initialize all required attributes
-        self.config = config
-        self.tokenizer = tokenizer
-
-        # Image size with force_image_size override
-        image_size: int = config.vision_config.image_size
-        if hasattr(config, "force_image_size") and config.force_image_size:
-            image_size = config.force_image_size
-
-        patch_size: int = config.vision_config.patch_size
-        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
-
-        # Compute num_image_token
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (downsample_ratio**2)
-        )
-        self.image_size = image_size
-
-        # Dynamic patch settings with defaults
-        self.min_dynamic_patch = (
-            min_dynamic_patch
-            if min_dynamic_patch is not None
-            else getattr(config, "min_dynamic_patch", 1)
-        )
-        self.max_dynamic_patch = (
-            max_dynamic_patch
-            if max_dynamic_patch is not None
-            else getattr(config, "max_dynamic_patch", 12)
-        )
-        self.dynamic_image_size = (
-            dynamic_image_size
-            if dynamic_image_size is not None
-            else getattr(config, "dynamic_image_size", True)
-        )
-        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
-
-    @property
-    def image_token_id(self) -> int:
-        """Get the image token ID from config or tokenizer."""
-        if hasattr(self.config, "image_token_index"):
-            return self.config.image_token_index
-        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
-        vocab = self.tokenizer.get_vocab()
-        if IMG_CONTEXT in vocab:
-            return vocab[IMG_CONTEXT]
-        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        """Get image replacement string for prompt."""
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-
 class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
    """Processing info for Eagle2.5-VL model."""


--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -11,7 +11,6 @@
 from collections.abc import Mapping, Sequence

 import torch
-from PIL import Image
 from transformers import PretrainedConfig

 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
    ProcessorInputs,
    PromptReplacement,
    PromptUpdate,
-    PromptUpdateDetails,
    TimingContext,
 )
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor

 from .intern_vit import InternVisionModel
 from .internvl import (
-    IMG_CONTEXT,
-    IMG_END,
-    IMG_START,
    BaseInternVLDummyInputsBuilder,
    BaseInternVLMultiModalProcessor,
    BaseInternVLProcessingInfo,
-    BaseInternVLProcessor,
    InternVLChatModel,
-    build_transform,
-    find_closest_aspect_ratio,
-    get_internvl_target_ratios,
 )


-def resolve_h2ovl_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_h2ovl_target_ratios(
-    min_num: int,
-    max_num: int,
-    *,
-    prior_aspect_ratio: tuple[int, int] | None,
-) -> list[tuple[int, int]]:
-    target_ratios = get_internvl_target_ratios(min_num, max_num)
-
-    # if prior_aspect_ratio is provided, filter the target ratios
-    if prior_aspect_ratio is not None:
-        target_ratios = [
-            ratio
-            for ratio in target_ratios
-            if prior_aspect_ratio[0] % ratio[0] != 0
-            and prior_aspect_ratio[1] % ratio[1] != 0
-        ]
-
-    return target_ratios
-
-
-# modified to include blocks generated in second pass
-def calculate_h2ovl_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int, tuple[int, int]]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height, target_aspect_ratio
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-# refactored to handle prior_aspect_ratio
-def dynamic_preprocess_h2ovl(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[list[Image.Image], tuple[int, int]]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    (
-        blocks,
-        target_width,
-        target_height,
-        target_aspect_ratio,
-    ) = calculate_h2ovl_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images, target_aspect_ratio
-
-
-def _preprocess_image(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-    prior_aspect_ratio: tuple[int, int] | None,
-) -> tuple[torch.Tensor, tuple[int, int]]:
-    target_ratios = get_h2ovl_target_ratios(
-        min_num,
-        max_num,
-        prior_aspect_ratio=prior_aspect_ratio,
-    )
-
-    transform = build_transform(input_size=input_size)
-    images, target_aspect_ratio = dynamic_preprocess_h2ovl(
-        image,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-        target_ratios=target_ratios,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values, target_aspect_ratio
-
-
-# refactored to use the _preprocess_image function
-def image_to_pixel_values_h2ovl(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-    use_msac: bool,
-) -> torch.Tensor:
-    # when MSAC is turned on, we need to process the image twice
-    if use_msac:
-        # first pass
-        pixel_values1, aspect_ratio1 = _preprocess_image(
-            image,
-            input_size=input_size,
-            min_num=1,
-            max_num=max_num,
-            use_thumbnail=True,
-            prior_aspect_ratio=None,
-        )
-        # second pass
-        pixel_values2, _ = _preprocess_image(
-            image,
-            input_size=input_size,
-            min_num=3,
-            max_num=max_num,
-            use_thumbnail=True,
-            prior_aspect_ratio=aspect_ratio1,
-        )
-        # combine pixel values
-        pixel_values = torch.cat(
-            [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0
-        )
-
-    else:
-        pixel_values, _ = _preprocess_image(
-            image,
-            input_size=input_size,
-            min_num=min_num,
-            max_num=max_num,
-            use_thumbnail=use_thumbnail,
-            prior_aspect_ratio=None,
-        )
-
-    return pixel_values
-
-
-class H2OVLProcessor(BaseInternVLProcessor):
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_msac: bool | None = None,
-    ) -> None:
-        super().__init__(
-            config,
-            tokenizer,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-        if use_msac is None:
-            use_msac = config.use_msac
-        assert isinstance(use_msac, bool)
-
-        self.use_msac = use_msac
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_h2ovl_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-        prior_aspect_ratio: tuple[int, int] | None = None,
-        override_min_num: int | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-        if override_min_num is not None:
-            min_num = override_min_num
-
-        return get_h2ovl_target_ratios(
-            min_num,
-            max_num,
-            prior_aspect_ratio=prior_aspect_ratio,
-        )
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        use_msac: bool | None = None,
-    ) -> int:
-        use_msac = self.use_msac if use_msac is None else use_msac
-
-        use_thumbnail = self.use_thumbnail
-
-        if use_msac:
-            target_ratios_1 = self.resolve_target_ratios(
-                use_thumbnail=False,  # Applied in calculate_targets
-                override_min_num=1,
-            )
-            num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
-                orig_width=image_width,
-                orig_height=image_height,
-                image_size=self.image_size,
-                target_ratios=target_ratios_1,
-                use_thumbnail=True,
-            )
-
-            target_ratios_2 = self.resolve_target_ratios(
-                use_thumbnail=False,  # Applied in calculate_targets
-                prior_aspect_ratio=aspect_ratio_1,
-                override_min_num=3,
-            )
-            num_patches_2, _, _, _ = calculate_h2ovl_targets(
-                orig_width=image_width,
-                orig_height=image_height,
-                image_size=self.image_size,
-                target_ratios=target_ratios_2,
-                use_thumbnail=True,
-            )
-
-            num_patches = num_patches_1 + num_patches_2 - 1
-        else:
-            target_ratios = self.resolve_target_ratios(
-                use_thumbnail=False,  # Applied in calculate_targets
-            )
-            num_patches, _, _, _ = calculate_h2ovl_targets(
-                orig_width=image_width,
-                orig_height=image_height,
-                image_size=self.image_size,
-                target_ratios=target_ratios,
-                use_thumbnail=use_thumbnail,
-            )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        use_msac = self.use_msac if len(images) == 1 else False
-
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_h2ovl(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-                use_msac=use_msac,
-            )
-            for image in images
-        ]
-
-
 class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
    def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
        return self.ctx.init_processor(

--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -7,16 +7,13 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, TypeAlias, TypeVar
+from typing import Annotated, Literal, TypeAlias, TypeVar

-import numpy.typing as npt
 import torch
 import torch.nn as nn
-import torchvision.transforms as T
-from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature, PretrainedConfig

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
 )
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalFieldConfig,
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
    BaseProcessingInfo,
    PromptReplacement,
    PromptUpdate,
-    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.internvl import (
+    BaseInternVLProcessor,
+    InternVLProcessor,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (
@@ -60,13 +58,6 @@ from .interfaces import (
 )
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix

-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<IMG_CONTEXT>"
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-

 class InternVLImagePixelInputs(TensorSchema):
    """
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
 InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs


-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def build_transform(input_size: int):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose(
-        [
-            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
-            T.Resize(
-                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
-            ),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
-        ]
-    )
-    return transform
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def find_closest_aspect_ratio(
-    aspect_ratio: float,
-    target_ratios: list[tuple[int, int]],
-    *,
-    width: int,
-    height: int,
-    image_size: int,
-) -> tuple[int, int]:
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def resolve_internvl_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_internvl_target_ratios(
-    min_num: int,
-    max_num: int,
-) -> list[tuple[int, int]]:
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if min_num <= i * j <= max_num
-    }
-    return sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-
-def calculate_internvl_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def dynamic_preprocess_internvl(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> list[Image.Image]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_internvl_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def image_to_pixel_values_internvl(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_internvl_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess_internvl(
-        image,
-        target_ratios=target_ratios,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def video_to_pixel_values_internvl(
-    video: npt.NDArray,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_internvl_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    frames_list = list[Image.Image]()
-    for frame in video:
-        pil_frame = dynamic_preprocess_internvl(
-            Image.fromarray(frame, mode="RGB"),
-            target_ratios=target_ratios,
-            image_size=input_size,
-            use_thumbnail=use_thumbnail,
-        )
-        assert len(pil_frame) == 1
-        frames_list.extend(pil_frame)
-
-    pixel_values = torch.stack([transform(image) for image in frames_list])
-    return pixel_values
-
-
-class BaseInternVLProcessor(ABC):
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        image_size: int = config.vision_config.image_size
-        patch_size: int = config.vision_config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = config.min_dynamic_patch
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = config.max_dynamic_patch
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = config.dynamic_image_size
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-
-    @property
-    @abstractmethod
-    def image_token_id(self) -> int:
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        raise NotImplementedError
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_internvl_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-        return get_internvl_target_ratios(min_num, max_num)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_internvl_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_internvl(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-            )
-            for image in images
-        ]
-
-    def _preprocess_image(
-        self,
-        text: list[str],
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> tuple[list[str], dict[str, torch.Tensor]]:
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-
-                image_repl = self.get_image_repl(feature_size, num_patches)
-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
-        return text, image_inputs
-
-    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
-        if input_item is None:
-            input_item = []
-        if not isinstance(input_item, list):
-            input_item = [input_item]
-        return input_item
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        text, images = [self._make_batch_input(x) for x in (text, images)]
-
-        text, image_inputs = self._preprocess_image(
-            text=text,
-            images=images,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-        text_inputs = self.tokenizer(text)
-
-        combined_outputs = {**text_inputs, **image_inputs}
-
-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
-
-
-class InternVLProcessor(BaseInternVLProcessor):
-    """
-    HF Processor for InternVLChatModel with extended video processing logic.
-
-    Code for video processing is adapted from video example:
-    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        video_token: str | None = None,
-    ) -> None:
-        super().__init__(
-            config=config,
-            tokenizer=tokenizer,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-        # add extra video token for video processing
-        self.video_token = video_token
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    @property
-    def video_token_id(self) -> int | None:
-        if self.video_token is None:
-            return None
-        return self.tokenizer.get_vocab().get(self.video_token, None)
-
-    @property
-    def supports_video(self) -> bool:
-        return self.video_token_id is not None
-
-    def _videos_to_pixel_values_lst(
-        self,
-        videos: list[npt.NDArray],
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=1,
-            max_dynamic_patch=1,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            video_to_pixel_values_internvl(
-                video,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=False,
-            )
-            for video in videos
-        ]
-
-    def _preprocess_video(
-        self,
-        text: list[str],
-        videos: list[npt.NDArray],
-        dynamic_image_size: bool | None = None,
-    ):
-        if len(videos) == 0 or not self.supports_video:
-            video_inputs = {}
-        else:
-            pixel_values_lst_video = self._videos_to_pixel_values_lst(
-                videos,
-                dynamic_image_size=dynamic_image_size,
-            )
-            video_inputs = {
-                "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
-                "video_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst_video]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst_video:
-                num_patches = pixel_values.shape[0]
-
-                video_repl = self.get_video_repl(
-                    self.num_image_token, num_patches, self.video_token
-                )
-                text = [t.replace("<video>", video_repl.full, 1) for t in text]
-        return text, video_inputs
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        videos: npt.NDArray | list[npt.NDArray] | None = None,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        text, images, videos = [
-            self._make_batch_input(x) for x in (text, images, videos)
-        ]
-
-        text, image_inputs = self._preprocess_image(
-            text=text,
-            images=images,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-        text, video_inputs = self._preprocess_video(
-            text=text,
-            videos=videos,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-        text_inputs = self.tokenizer(text)
-
-        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
-
-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def get_video_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None = None,
-        video_context_token: str = IMG_CONTEXT,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = video_context_token * self.num_image_token
-        repl_features_with_sep = IMG_START + repl_features + IMG_END
-        # num_patches is equal to num_frames
-        repl_full = "".join(
-            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
-        )
-
-        return PromptUpdateDetails.select_text(repl_full, video_context_token)
-
-
 class BaseInternVLProcessingInfo(BaseProcessingInfo):
    """Basic image-only ProcessingInfo for InternVL-style models."""


--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -8,22 +8,15 @@
 # --------------------------------------------------------

 import copy
-import math
 import warnings
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from dataclasses import dataclass
 from functools import cached_property
-from typing import Annotated, Any, Literal, TypeAlias, TypeVar
+from typing import Annotated, Literal, TypeAlias, TypeVar

-import einops
-import numpy as np
-import numpy.typing as npt
-import regex as re
 import torch
 import torch.nn as nn
-from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
@@ -38,10 +31,6 @@ from vllm.model_executor.models.interfaces import (
    SupportsMultiModal,
    SupportsMultiModalPruning,
 )
-from vllm.model_executor.models.internvl import (
-    calculate_internvl_targets,
-    get_internvl_target_ratios,
-)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM
 from vllm.model_executor.models.parakeet import ParakeetExtractor, ProjectedParakeet
@@ -83,23 +72,28 @@ from vllm.multimodal.processing.processor import (
    BaseProcessingInfo,
    PromptReplacement,
    PromptUpdate,
-    PromptUpdateDetails,
-    _seq2tokens,
 )
 from vllm.renderers import TokenizeParams
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.transformers_utils.configs.radio import RadioConfig
+from vllm.transformers_utils.processors.nano_nemotron_vl import (
+    AUDIO_CONTEXT,
+    IMG_CONTEXT,
+    IMG_END,
+    IMG_START,
+    BaseNanoNemotronVLProcessor,
+    DynamicResolutionImageTiler,
+    NanoNemotronVLProcessor,
+    get_internvl_target_ratios,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .utils import _merge_multimodal_embeddings

 logger = init_logger(__name__)
-# Configure PIL to handle large images without warnings
-# This prevents DecompressionBombWarning for legitimate large images
-Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
-# Alternative: Set a specific higher limit
-# Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
+
+MAX_AUDIO_LEN_S = 10 * 60  # 10 minutes


 class NanoNemotronVLAudioFeatureInputs(TensorSchema):
@@ -116,20 +110,6 @@ class NanoNemotronVLAudioFeatureInputs(TensorSchema):
    audio_feature_lengths: Annotated[torch.Tensor, TensorShape("b")]


-MAX_AUDIO_LEN_S = 10 * 60  # 10 minutes
-
-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<image>"
-AUDIO_START = "<so_start>"
-AUDIO_END = "<so_end>"
-AUDIO_CONTEXT = "<so_embedding>"
-
-# Profiling
-# MAX_FRAMES = 16
-DEFAULT_NUM_TILES = 12
-
-
 class NanoNemotronVLImagePixelInputs(TensorSchema):
    """
    Dimensions:
@@ -213,987 +193,6 @@ NanoNemotronVLVideoInputs: TypeAlias = (
 )


-def dynamic_preprocess(
-    image,
-    *,
-    image_size=512,
-    max_num_tiles=12,
-    use_thumbnail=True,
-    idx=0,
-):
-    orig_width, orig_height = image.size
-
-    target_ratios = get_internvl_target_ratios(1, max_num_tiles)
-
-    blocks, target_width, target_height = calculate_internvl_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    image = np.asarray(
-        image.convert("RGB") if image.mode != "RGB" else image, dtype=np.uint8
-    )
-
-    image = torch.from_numpy(image).unsqueeze(0)  # (1, H, W, 3)
-    image = image.permute(0, 3, 1, 2)  # (1, 3, H, W)
-
-    resized_img = torch.nn.functional.interpolate(
-        image,
-        size=(target_height, target_width),
-        mode="bicubic",
-        align_corners=False,
-        antialias=True,
-    )
-    B, C, H, W = resized_img.shape
-    hp, wp = H // image_size, W // image_size
-    patches = (
-        resized_img.reshape(B, C, hp, image_size, wp, image_size)
-        .permute(0, 2, 4, 1, 3, 5)
-        .reshape(B * hp * wp, C, image_size, image_size)
-        / 255.0
-    )
-
-    if use_thumbnail and patches.shape[0] > 1:
-        thumb = (
-            torch.nn.functional.interpolate(
-                image,
-                size=(image_size, image_size),
-                mode="bicubic",
-                align_corners=False,
-                antialias=True,
-            )
-            / 255.0
-        )
-        patches = torch.cat([patches, thumb], dim=0)
-
-    return list(patches)
-
-
-def image_to_pixel_values(
-    image: Image.Image,
-    *,
-    input_size: int,
-    max_num: int,
-    use_thumbnail: bool,
-    idx: int,
-) -> torch.Tensor:
-    images = dynamic_preprocess(
-        image,
-        image_size=input_size,
-        max_num_tiles=max_num,
-        use_thumbnail=use_thumbnail,
-        idx=idx,
-    )
-
-    pixel_values = torch.stack(images)
-    return pixel_values
-
-
-def video_to_pixel_values(
-    video: npt.NDArray,
-    *,
-    input_size: int,
-    max_num_tiles: int = 1,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    assert max_num_tiles == 1, "Video modality always uses one tile"
-
-    # (num_frames, H, W, C) -> (num_frames, C, H, W)
-    video_tensor = torch.from_numpy(video).permute(0, 3, 1, 2)
-
-    if video_tensor.shape[2] != input_size or video_tensor.shape[3] != input_size:
-        video_tensor = torch.nn.functional.interpolate(
-            video_tensor,
-            size=(input_size, input_size),
-            mode="bicubic",
-            align_corners=False,
-            antialias=True,
-        )
-
-    video_tensor = video_tensor / 255.0
-
-    return video_tensor
-
-
-def input_conditioner(x, norm_mean, norm_std):
-    return (x - norm_mean) / norm_std
-
-
-def calculate_timestamps(
-    indices: list[int] | torch.Tensor,
-    frame_duration_ms: int,
-):
-    if not isinstance(indices, list):
-        indices = indices.tolist()
-
-    timestamps = [int(i) * frame_duration_ms / 1000.0 for i in indices]
-    return timestamps
-
-
-class DynamicResolutionImageTiler:
-    CONV_MERGING = False
-    PIXEL_SHUFFLE = True
-    USE_THUMBNAIL = False
-
-    def __init__(
-        self,
-        *,
-        max_model_len: int,
-        patch_size: int,
-        min_num_patches: int,
-        max_num_patches: int,
-        downsample_ratio: int,
-        norm_mean: Sequence[float],
-        norm_std: Sequence[float],
-        factor_max: float = 1.0,
-        use_thumbnail: bool = False,
-    ) -> None:
-        assert use_thumbnail is False, "use_thumbnail is not supported"
-        self._patch_size: int = patch_size
-        self._max_model_len = max_model_len
-        self._min_num_patches = min_num_patches
-        self._max_num_patches = max_num_patches if max_num_patches > 0 else float("inf")
-        self._factor_max = factor_max
-        self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1)
-        self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1)
-        assert downsample_ratio < 1
-        reduction_factor = 1 / downsample_ratio
-        assert reduction_factor == 2.0
-        self._downsample_ratio = int(reduction_factor) ** (
-            self.PIXEL_SHUFFLE + self.CONV_MERGING
-        )
-        assert self._downsample_ratio == 2
-
-    def _get_num_embeddings(self, width: int, height: int) -> int:
-        num_patches = (width // self._patch_size) * (height // self._patch_size)
-        num_tokens = num_patches // (self._downsample_ratio**2)
-        return num_tokens
-
-    def width_and_height_for_max_num_tokens_available(
-        self,
-        target_num_tokens_post_shuffle: int,
-    ) -> tuple[int, int]:
-        """
-        TODO: optimize this so it squeezes closer to target number of tokens.
-        Calculate image dimensions that produce approximately `target` tokens after
-        pixel_shuffle.
-
-        With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
-        need 4*B patches to get B tokens.
-
-        Examples:
-        >>> PATCH_SIZE = 16
-        >>> DOWNSAMPLE_RATIO = 0.5
-        >>> tiler = DynamicResolutionImageTiler(
-        ...     max_model_len=16384,
-        ...     patch_size=PATCH_SIZE,
-        ...     downsample_ratio=DOWNSAMPLE_RATIO,
-        ...     min_num_patches=4,
-        ...     max_num_patches=0,
-        ... )
-        >>> width, height = tiler.width_and_height_for_max_num_tokens_available(
-        ...     target_num_tokens_post_shuffle=8192,
-        ... )
-        >>> assert width, height == (2880, 2880)
-        >>> assert (width // PATCH_SIZE) * (
-        ...     height // PATCH_SIZE
-        ... ) // 2**2 == 8100  # tokens post-shuffle
-        >>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
-        """
-        side_pixels = (
-            math.isqrt(target_num_tokens_post_shuffle)
-            * self._downsample_ratio
-            * self._patch_size
-        )
-        assert isinstance(side_pixels, int) and side_pixels % self._patch_size == 0
-        return side_pixels, side_pixels
-
-    def max_num_tokens_available(self, text_prompt_length: int) -> int:
-        return self._max_model_len - text_prompt_length - 4
-
-    def _images_to_pixel_values_lst(
-        self,
-        text_prompt_length: int,
-        images: list[Image.Image],
-    ) -> tuple[list[torch.Tensor], list[int]]:
-        num_tokens_available = self.max_num_tokens_available(text_prompt_length)
-        params_per_image = self.compute_params(images, num_tokens_available)
-
-        feature_sizes = []
-        images = []
-        for param in params_per_image:
-            for t in self.apply_params(param):
-                assert t.ndim == 3, f"{t.ndim=}: expected 3 dim tensor"
-                images.append(t)
-                feature_sizes.append(param.num_embeddings)
-        return images, feature_sizes
-
-    feature_size_cache: dict[Image.Image, int] = {}
-
-    @classmethod
-    def get_cached_feature_size(cls, image: Image.Image) -> int:
-        feature_size = cls.feature_size_cache[id(image)]
-        # hard assert that we only use the feature size once
-        del cls.feature_size_cache[id(image)]
-        return feature_size
-
-    @dataclass
-    class DynamicResolutionParams:
-        media: Image.Image
-        num_tiles: int
-        num_embeddings: int
-        patch_size: tuple[int, int]
-
-    def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
-        target_size = (
-            params.patch_size[1] * self._patch_size,
-            params.patch_size[0] * self._patch_size,
-        )
-        image = np.asarray(
-            params.media.convert("RGB") if params.media.mode != "RGB" else params.media,
-            dtype=np.uint8,
-        )
-        resized_img = (
-            torch.nn.functional.interpolate(
-                torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
-                size=target_size,
-                mode="bicubic",
-                align_corners=False,
-                antialias=True,
-            )
-            / 255.0
-        )
-        return list(resized_img)
-
-    def process_media(
-        self,
-        media: Image.Image,
-        num_tokens_available: int,
-    ) -> tuple[DynamicResolutionParams, int]:
-        """Process a single media item and return its parameters.
-
-        Args:
-            media: The media item to process
-            num_tokens_available: Number of tokens available for this media
-        Returns:
-            DynamicResolutionParams for the media
-        """
-        current_num_tokens_available = num_tokens_available
-        assert isinstance(media, Image.Image), (
-            "Dynamic resolution is only supported for image media"
-        )
-        orig_width, orig_height = media.width, media.height
-        closest_patch_height = round(orig_height / self._patch_size + 0.5)
-        closest_patch_width = round(orig_width / self._patch_size + 0.5)
-        patches = closest_patch_height * closest_patch_width
-
-        factor = min(
-            math.sqrt(current_num_tokens_available / patches), self._factor_max
-        )
-        target_patch_height = math.floor(factor * closest_patch_height)
-        target_patch_width = math.floor(factor * closest_patch_width)
-
-        # Consider self._min_num_patches if > current_num_tokens_available.
-        if (
-            current_num_tokens_available > self._min_num_patches
-            and target_patch_height * target_patch_width < self._min_num_patches
-        ):
-            up_factor = math.sqrt(
-                self._min_num_patches / (target_patch_height * target_patch_width)
-            )
-            target_patch_height = math.ceil(up_factor * target_patch_height)
-            target_patch_width = math.ceil(up_factor * target_patch_width)
-
-        # Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
-        # or by 4 when BOTH are enabled (two successive 2x reductions)
-        if self.PIXEL_SHUFFLE or self.CONV_MERGING:
-            required_divisor = 4 if (self.PIXEL_SHUFFLE and self.CONV_MERGING) else 2
-
-            rem_h = target_patch_height % required_divisor
-            if rem_h != 0:
-                inc_h = required_divisor - rem_h
-                if (
-                    target_patch_height + inc_h
-                ) * target_patch_width <= current_num_tokens_available:
-                    target_patch_height += inc_h
-                else:
-                    target_patch_height = max(
-                        required_divisor, target_patch_height - rem_h
-                    )
-
-            rem_w = target_patch_width % required_divisor
-            if rem_w != 0:
-                inc_w = required_divisor - rem_w
-                if (
-                    target_patch_height * (target_patch_width + inc_w)
-                    <= current_num_tokens_available
-                ):
-                    target_patch_width += inc_w
-                else:
-                    target_patch_width = max(
-                        required_divisor, target_patch_width - rem_w
-                    )
-
-        # Calculate embeddings for the main dynamic resolution image
-        num_embeddings = self._get_num_embeddings(
-            target_patch_width * self._patch_size,
-            target_patch_height * self._patch_size,
-        )
-
-        token_count = target_patch_width * target_patch_height
-
-        # Add thumbnail embeddings if enabled and image area is below threshold
-        num_tiles = 1  # Base dynamic resolution image
-
-        return self.DynamicResolutionParams(
-            media=media,
-            num_tiles=num_tiles,
-            num_embeddings=num_embeddings,
-            patch_size=(target_patch_width, target_patch_height),
-        ), token_count
-
-    def compute_params(
-        self,
-        media_list: list[Image.Image],
-        num_tokens_available: int | None = None,
-    ) -> list[DynamicResolutionParams]:
-        """Compute parameters for all media with iterative token budgeting.
-
-        Args:
-            media_list: List of media items to process
-            num_tokens_available: Total number of tokens available across all media
-        Returns:
-            List of ImageTilingParams for each media item
-        """
-        num_tokens_available = (
-            num_tokens_available
-            * (4 if self.PIXEL_SHUFFLE else 1)
-            * (4 if self.CONV_MERGING else 1)
-        )
-        # When the number of available token is too small,
-        # allow self._min_num_patches per media and let the sample be truncated.
-        num_tokens_available = max(
-            num_tokens_available, self._min_num_patches * len(media_list)
-        )
-
-        # Clip the number of tokens available per media to >min and <max patches.
-        num_tokens_available_per_media = [
-            max(min(num_tokens_available, self._max_num_patches), self._min_num_patches)
-            for _ in range(len(media_list))
-        ]
-
-        # prevent infinite loop in any case
-        for _ in range(10):
-            # Step 1: Process each media with current token budget
-            params = []
-            token_counts = []
-
-            for media, tokens_for_media in zip(
-                media_list, num_tokens_available_per_media
-            ):
-                param, token_count = self.process_media(media, tokens_for_media)
-                params.append(param)
-                token_counts.append(token_count)
-                self.feature_size_cache[id(param.media)] = param.num_embeddings
-
-            # Step 2: Check if total tokens is within budget
-            total_tokens = sum(token_counts)
-
-            if total_tokens <= num_tokens_available:
-                # We're within budget, return the params
-                return params
-
-            # Step 3: We're over budget, need to scale down
-            # Calculate scaling factor to get under budget
-            scaling_factor = num_tokens_available / total_tokens
-
-            # Recalculate token budgets for each media based on scaling
-            # Each media gets a proportional share of the total budget
-            scaled_down_num_tokens_available_per_media = [
-                max(self._min_num_patches, int(token_count * scaling_factor))
-                for token_count in token_counts
-            ]
-            scaled_down = any(
-                [
-                    scaled_down_num_tokens_available_per_media[i]
-                    < num_tokens_available_per_media[i]
-                    for i in range(len(num_tokens_available_per_media))
-                ]
-            )
-            # If there wasn't scaling down, we're stuck with min_num_patches per media,
-            # else try with the scaled down num_tokens_available_per_media.
-            if not scaled_down:
-                num_tokens_available_per_media = [self._min_num_patches] * len(
-                    media_list
-                )
-            else:
-                num_tokens_available_per_media = (
-                    scaled_down_num_tokens_available_per_media
-                )
-        ctx = f"{params=} {total_tokens=} {num_tokens_available=}"
-        raise ValueError(
-            f"Should be unreachable - `return params` above must be reached: {ctx}"
-        )
-
-    @staticmethod
-    def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
-        assert len(images) > 0, "No images to stack"
-
-        def rearrange_img(x):
-            py = x.shape[-2] // patch_size
-            px = x.shape[-1] // patch_size
-            x = einops.rearrange(
-                x,
-                "c (py yy) (px xx) -> (py px) (c yy xx)",
-                py=py,
-                yy=patch_size,
-                px=px,
-                xx=patch_size,
-            )
-            return x
-
-        imgs = [rearrange_img(img) for img in images]
-        pixel_values_flat = torch.cat(imgs, dim=0).unsqueeze(0)
-        return pixel_values_flat
-
-
-class BaseNanoNemotronVLProcessor(ABC):
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *args,
-        max_model_len: int,
-        max_num_tiles: int | None = None,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
-        image_size: int = config.force_image_size
-        patch_size: int = config.patch_size
-        downsample_ratio: int = config.downsample_ratio
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-        self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
-        self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)
-
-        self.dynamic_tiler: DynamicResolutionImageTiler | None = None
-        if self.use_dynamic_resolution(config):
-            self.dynamic_tiler = DynamicResolutionImageTiler(
-                max_model_len=max_model_len,
-                patch_size=patch_size,
-                downsample_ratio=downsample_ratio,
-                min_num_patches=config.vision_config.args["min_num_patches"],
-                max_num_patches=config.vision_config.args["max_num_patches"],
-                norm_mean=config.norm_mean,
-                norm_std=config.norm_std,
-            )
-
-    @staticmethod
-    def use_dynamic_resolution(config: PretrainedConfig) -> bool:
-        return "min_num_patches" in config.vision_config.args
-
-    @property
-    @abstractmethod
-    def image_token_id(self) -> int:
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        raise NotImplementedError
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        max_num_tiles: int,
-    ) -> int:
-        target_ratios = get_internvl_target_ratios(1, max_num_tiles)
-
-        num_patches, _, _ = calculate_internvl_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            target_ratios=target_ratios,
-            image_size=self.image_size,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        max_num_tiles: int,
-    ) -> list[torch.Tensor]:
-        return [
-            image_to_pixel_values(
-                image,
-                input_size=self.image_size,
-                max_num=max_num_tiles,
-                use_thumbnail=self.use_thumbnail,
-                idx=idx,
-            )
-            for idx, image in enumerate(images)
-        ]
-
-    def _preprocess_image(
-        self,
-        text: list[str],
-        images: list[Image.Image],
-        max_num_tiles: int,
-    ) -> tuple[list[str], dict[str, Any]]:
-        if len(images) == 0:
-            image_inputs = {}
-            return text, image_inputs
-
-        if tiler := self.dynamic_tiler:
-            sans_images = text[0].replace("<image>", "")
-            text_prompt_length = len(
-                self.tokenizer(sans_images, add_special_tokens=False).input_ids
-            )
-            pixel_values_lst, num_tokens_per_image = tiler._images_to_pixel_values_lst(
-                text_prompt_length=text_prompt_length,
-                images=images,
-            )
-            imgs_sizes = [(pv.shape[-2], pv.shape[-1]) for pv in pixel_values_lst]
-            normalized = [
-                input_conditioner(img, tiler.norm_mean, tiler.norm_std)
-                for img in pixel_values_lst
-            ]
-            image_num_patches = torch.tensor([1] * len(num_tokens_per_image))
-            image_inputs = {
-                "pixel_values_flat": normalized,
-                "imgs_sizes": imgs_sizes,
-                "num_tokens_per_image": num_tokens_per_image,
-            }
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
-            image_num_patches = torch.tensor([len(item) for item in pixel_values_lst])
-            pixel_values_flat = input_conditioner(
-                torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
-            )
-            image_inputs = {
-                "pixel_values_flat": pixel_values_flat,
-                "image_num_patches": image_num_patches,
-            }
-            num_tokens_per_image = [
-                self.num_image_token * len(item) for item in pixel_values_lst
-            ]
-
-        assert len(text) == 1, (
-            "hf_processor is called on the output of get_dummy_text, "
-            "which should be a single string"
-        )
-        parts = [x for x in re.split(r"(<image>)", text[0]) if x]
-        assert parts.count("<image>") == len(pixel_values_lst), (
-            "the number of <image> tokens in the text should be the "
-            "same as the number of images"
-        )
-
-        for i, (feature_size, num_patches) in enumerate(
-            zip(num_tokens_per_image, image_num_patches, strict=True)
-        ):
-            image_repl = self.get_image_repl(feature_size, num_patches)
-            parts[i] = parts[i].replace("<image>", image_repl.full)
-        text = ["".join(parts)]
-
-        return text, image_inputs
-
-    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
-        if input_item is None:
-            input_item = []
-        if not isinstance(input_item, list):
-            input_item = [input_item]
-        return input_item
-
-    @abstractmethod
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        return_tensors: str | TensorType | None = None,
-        max_num_tiles: int | None = None,
-    ) -> BatchFeature:
-        raise NotImplementedError
-
-
-class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
-    """
-    HF Processor  with extended video processing logic.
-    Code for video processing is adapted from video example:
-    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        max_model_len: int,
-        max_num_tiles: int | None = None,
-        video_token: str | None = None,
-        video_pruning_rate: float | None = None,
-    ) -> None:
-        super().__init__(
-            config=config,
-            tokenizer=tokenizer,
-            max_model_len=max_model_len,
-            max_num_tiles=max_num_tiles,
-        )
-        # add extra video token for video processing
-        self.video_token = video_token
-        self.video_pruning_rate = video_pruning_rate
-
-        self.audio_extractor: ParakeetExtractor | None = None
-        raw_sound_config = getattr(config, "sound_config", None)
-        if raw_sound_config is not None:
-            self.audio_extractor = ParakeetExtractor(raw_sound_config)
-
-        # Pre-tokenize special tokens for video processing
-        # to avoid repeated tokenization
-        self._img_start_token_ids = tokenizer.encode(
-            IMG_START, add_special_tokens=False
-        )
-        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
-        self._img_context_token_ids = tokenizer.encode(
-            IMG_CONTEXT, add_special_tokens=False
-        )
-
-    @property
-    def supports_video(self) -> bool:
-        return self.video_token_id is not None
-
-    @property
-    def video_token_id(self) -> int | None:
-        if self.video_token is None:
-            return None
-        return self.tokenizer.get_vocab().get(self.video_token, None)
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
-
-    def _videos_to_pixel_values_lst(
-        self,
-        videos: list[npt.NDArray],
-        max_num_tiles: int,
-    ) -> list[torch.Tensor]:
-        return [
-            video_to_pixel_values(
-                video,
-                input_size=self.image_size,
-                max_num_tiles=max_num_tiles,
-                use_thumbnail=self.use_thumbnail,
-            )
-            for video in videos
-        ]
-
-    def _preprocess_video(
-        self,
-        text: list[str],
-        videos: list[tuple[npt.NDArray, dict[str, Any]]],
-        max_num_tiles: int,
-    ):
-        if len(videos) == 0 or not self.supports_video:
-            video_inputs = {}
-        else:
-            videos_lst = [v[0] for v in videos]
-            video_metadata_lst = [v[1] for v in videos]
-            pixel_values_lst_video = self._videos_to_pixel_values_lst(
-                videos_lst,
-                max_num_tiles=max_num_tiles,
-            )
-
-            # We use frame duration in milliseconds (as integer) to ensure
-            # we have consistent timestamps calculation. At preprocessing
-            # fps parameter is given in fp32, while at inference it is bf16
-            # which leads to inaccurate timestamp calculation and causes
-            # timestamp values to differ.In rare cases this causes
-            # mismatching number of output tokens for tokenized  frame prefixes
-            frame_duration_ms_lst = [
-                int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
-            ]
-            frames_indices_lst = [
-                metadata["frames_indices"] for metadata in video_metadata_lst
-            ]
-            video_num_patches = torch.tensor(
-                [len(item) for item in pixel_values_lst_video]
-            )
-            video_inputs = {
-                "pixel_values_flat_video": input_conditioner(
-                    torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
-                ),
-                "video_num_patches": video_num_patches,
-                "frames_indices": frames_indices_lst,
-                "frame_duration_ms": torch.tensor(frame_duration_ms_lst),
-            }
-
-            image_size: int = self.config.force_image_size
-            patch_size: int = self.config.patch_size
-            downsample_ratio = self.config.downsample_ratio
-            tokens_in_single_frame = int(
-                (image_size * image_size // patch_size**2) * (downsample_ratio**2)
-            )
-
-            for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
-                pixel_values_lst_video,
-                video_metadata_lst,
-                frames_indices_lst,
-                frame_duration_ms_lst,
-            ):
-                num_frames = pixel_values.shape[0]
-
-                if (
-                    self.video_pruning_rate is not None
-                    and self.video_pruning_rate > 0.0
-                ):
-                    # Start of EVS-specific code
-                    num_tokens = compute_retained_tokens_count(
-                        tokens_per_frame=tokens_in_single_frame,
-                        num_frames=num_frames,
-                        q=self.video_pruning_rate,
-                    )
-
-                    # Here we just need placeholders that won't actually be replaced -
-                    # we just need to make sure the total number of tokens is correct
-                    # assign all tokens to the first frame
-                    tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
-
-                    # End of EVS-specific code
-                else:
-                    tokens_per_frame = [tokens_in_single_frame] * num_frames
-
-                video_repl = self.get_video_repl(
-                    tokens_per_frame=tokens_per_frame,
-                    frames_indices=frames_indices,
-                    frame_duration_ms=frame_duration_ms,
-                    tokenizer=self.tokenizer,
-                    img_start_token_ids=self._img_start_token_ids,
-                    img_end_token_ids=self._img_end_token_ids,
-                    img_context_token_ids=self._img_context_token_ids,
-                )
-
-                # video_repl.full is a list of token IDs
-                # Convert token IDs back to text for the HF processor flow
-                video_repl_text = self.tokenizer.decode(
-                    video_repl.full, skip_special_tokens=False
-                )
-                text = [t.replace("<video>", video_repl_text, 1) for t in text]
-
-        return text, video_inputs
-
-    def _preprocess_audio(
-        self,
-        text: list[str],
-        audios: list[npt.NDArray],
-    ):
-        if len(audios) == 0:
-            return text, {}
-        assert self.audio_extractor is not None
-
-        extractor = self.audio_extractor
-
-        parts = [x for x in re.split(f"({re.escape(AUDIO_CONTEXT)})", text[0]) if x]
-        token_count = parts.count(AUDIO_CONTEXT)
-        if token_count != len(audios):
-            raise ValueError(
-                "Number of audio tokens in text does not match the number "
-                f"of audios (tokens={token_count}, audios={len(audios)})."
-            )
-        audio_index = 0
-        for idx, part in enumerate(parts):
-            if part == AUDIO_CONTEXT:
-                audio_repl = self.get_audio_repl(audios[audio_index])
-                parts[idx] = audio_repl.full
-                audio_index += 1
-        text = ["".join(parts)]
-        audio_inputs = extractor(
-            audios,
-            sampling_rate=extractor.sampling_rate,
-            return_tensors="pt",
-        )
-        input_audio_features = audio_inputs.input_features
-        feature_attention_mask = audio_inputs.attention_mask
-        audio_feature_lengths = feature_attention_mask.sum(dim=1)
-        audio_inputs = {
-            "input_audio_features": input_audio_features,
-            "feature_attention_mask": feature_attention_mask,
-            "audio_feature_lengths": audio_feature_lengths,
-        }
-
-        return text, audio_inputs
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        videos: list[tuple[npt.NDArray, dict[str, Any]]] | None = None,
-        audios: AudioItem | list[AudioItem] | None = None,
-        return_tensors: str | TensorType | None = None,
-        max_num_tiles: int | None = None,
-    ) -> BatchFeature:
-        # Use default if not provided
-        if max_num_tiles is None:
-            max_num_tiles = self.max_num_tiles
-
-        text, images, videos, audios = [
-            self._make_batch_input(x) for x in (text, images, videos, audios)
-        ]
-
-        text, image_inputs = self._preprocess_image(
-            text=text,
-            images=images,
-            max_num_tiles=max_num_tiles,
-        )
-
-        text, video_inputs = self._preprocess_video(
-            text=text,
-            videos=videos,
-            max_num_tiles=1,
-        )
-
-        text, audio_inputs = self._preprocess_audio(
-            text=text,
-            audios=audios,
-        )
-
-        text_inputs = self.tokenizer(text, add_special_tokens=False)
-
-        combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}
-
-        if self.dynamic_tiler is None:
-            batch = BatchFeature(
-                {**combined_inputs, **image_inputs},
-                tensor_type=return_tensors,
-            )
-        else:
-            batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
-            # allow images to be exempt from the BatchFeature validation:
-            # We will .stack() them in _parse_and_validate_image_input
-            batch.update(image_inputs)
-        return batch
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def get_audio_repl(
-        self,
-        audio: npt.NDArray,
-    ) -> PromptUpdateDetails[str]:
-        assert self.audio_extractor is not None
-        num_tokens = self.audio_extractor.audio_token_count(len(audio))
-        repl_full = f"{AUDIO_START}{AUDIO_CONTEXT * num_tokens}{AUDIO_END}"
-        return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)
-
-    @classmethod
-    def get_video_repl(
-        cls,
-        *,
-        tokens_per_frame: list[int],
-        frames_indices: list[int],
-        frame_duration_ms: int,
-        tokenizer: TokenizerLike,
-        img_start_token_ids: list[int],
-        img_end_token_ids: list[int],
-        img_context_token_ids: list[int],
-    ) -> PromptUpdateDetails[list[int]]:
-        """
-        Build prompt replacement for a video.
-        The replacement returned is not actually used to replace the placeholder
-        tokens - it's just used to make sure we allocate the correct number
-        of tokens.
-        Actual replacement is done in embed_multimodal of
-        NemotronH_Nano_VL_V2
-        (specifically in _process_video_input -> _create_final_video_embeddings).
-        There, we create the final embeddings with text embeddings for indicator tokens
-        and video embeddings for video tokens.
-        This is a single function that handles all cases - non EVS, EVS dummy, EVS real.
-        The differentiation is done via tokens_per_frame parameter.
-        - non EVS case - constant value same value across all frames
-        - EVS dummy - Doesn't matter how tokens are distributed between frames - just
-                        make sure the total number of tokens is correct.
-        - EVS real (called from get_real_video_repl_for_evs) - different value per frame
-        Args:
-            tokens_per_frame (list[int]): number of tokens per frame
-            frames_indices (list[int]): frame indices
-            frame_duration_ms (int): duration of each frame in milliseconds
-            tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
-            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
-            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
-            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
-        """
-        # TODO: Add support of frame_duration_ms to be None
-        # At preprocessing step we should allow absent / metadata without
-        # frames_indices field.
-        timestamps_enabled = frame_duration_ms is not None
-
-        if timestamps_enabled:
-            timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
-
-            assert len(timestamps) == len(tokens_per_frame), (
-                "timestamps and tokens_per_frame must have the same length"
-            )
-            frame_separators = [
-                f"Frame {i + 1} sampled at {timestamp:.2f} seconds: "
-                for i, timestamp in enumerate(timestamps)
-            ]
-        else:
-            frame_separators = [
-                f"Frame {i + 1}: " for i, _ in enumerate(tokens_per_frame)
-            ]
-
-        # Tokenize frame separator independently
-        frame_separators_tokenized = [
-            _seq2tokens(tokenizer, sep) for sep in frame_separators
-        ]
-
-        # Tokenize each component independently to avoid tokenizer merging tokens
-        # across boundaries. This ensures consistent tokenization regardless of
-        # num_tokens_per_frame values.
-        all_token_ids = []
-        for i, num_tokens in enumerate(tokens_per_frame):
-            frame_sep_token_ids = frame_separators_tokenized[i]
-            all_token_ids.extend(frame_sep_token_ids)
-
-            # Add pre-tokenized special tokens
-            all_token_ids.extend(img_start_token_ids)
-            all_token_ids.extend(img_context_token_ids * num_tokens)
-            all_token_ids.extend(img_end_token_ids)
-
-        return PromptUpdateDetails.from_seq(all_token_ids)
-
-
 class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
    """Basic image-only ProcessingInfo for InternVL-style models."""


--- a/vllm/model_executor/models/nemotron_parse.py
+++ b/vllm/model_executor/models/nemotron_parse.py
@@ -11,18 +11,13 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Literal

-import numpy as np
 import torch
 import torch.nn as nn
 from einops import rearrange
-from PIL import Image
-from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from torchvision import transforms as T
 from transformers import (
    BartConfig,
    BatchFeature,
    PretrainedConfig,
-    TensorType,
 )

 from vllm.config import CacheConfig, VllmConfig
@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
    PromptUpdate,
 )
 from vllm.renderers import TokenizeParams
-from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.configs.radio import RadioConfig
+from vllm.transformers_utils.processors.nemotron_parse import NemotronParseProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.attention.backend import AttentionType

 logger = init_logger(__name__)
-DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)


 class BartScaledWordEmbedding(VocabParallelEmbedding):
@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
    data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]


-class NemotronParseImageProcessor:
-    """
-    NemotronParse Image Processor
-    """
-
-    def __init__(
-        self,
-        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
-        **kwargs,
-    ):
-        # Ensure final_size is properly formatted
-        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
-            self.final_size = (int(final_size[0]), int(final_size[1]))
-        elif isinstance(final_size, (int, float)):
-            self.final_size = (int(final_size), int(final_size))
-        else:
-            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback
-
-        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
-        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
-
-        # Create transforms
-        self._create_transforms()
-
-    def _create_transforms(self):
-        """Create transform objects."""
-        try:
-            import albumentations as A
-        except ImportError as err:
-            raise ImportError(
-                "The package `albumentations` is required to use "
-                "NemotronParse model. Please install it with `pip install "
-                "albumentations`."
-            ) from err
-
-        # Ensure final_size is a tuple of integers
-        if isinstance(self.final_size, (list, tuple)):
-            self.target_height, self.target_width = (
-                int(self.final_size[0]),
-                int(self.final_size[1]),
-            )
-        else:
-            self.target_height = self.target_width = int(self.final_size)
-
-        import cv2
-
-        self.transform = A.Compose(
-            [
-                A.PadIfNeeded(
-                    min_height=self.target_height,
-                    min_width=self.target_width,
-                    border_mode=cv2.BORDER_CONSTANT,
-                    fill=[255, 255, 255],
-                    p=1.0,
-                ),
-            ]
-        )
-
-        self.torch_transform = T.Compose(
-            [
-                T.ToTensor(),
-            ]
-        )
-
-    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
-        """Resize image maintaining aspect ratio (exact replica of original
-        LongestMaxSizeHW)."""
-        height, width = image.shape[:2]
-        max_size_height = self.target_height
-        max_size_width = self.target_width
-
-        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
-        aspect_ratio = width / height
-        new_height = height
-        new_width = width
-
-        # If height too big then scale image down
-        if height > max_size_height:
-            new_height = max_size_height
-            new_width = int(new_height * aspect_ratio)
-
-        # If width too big, scale image down further
-        if new_width > max_size_width:
-            new_width = max_size_width
-            new_height = int(new_width / aspect_ratio)
-
-        # Use cv2.INTER_LINEAR like the original
-        import cv2
-
-        return cv2.resize(
-            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
-        )
-
-    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
-        """Pad image to target size with white padding (matches A.PadIfNeeded
-        behavior)."""
-        h, w = image.shape[:2]
-        min_height, min_width = self.target_height, self.target_width
-
-        # Only pad if image is smaller than target (matches A.PadIfNeeded logic)
-        pad_h = max(0, min_height - h)
-        pad_w = max(0, min_width - w)
-
-        if pad_h == 0 and pad_w == 0:
-            return image
-
-        # A.PadIfNeeded pads to bottom-right with constant value
-        if len(image.shape) == 3:
-            # Color image - pad bottom and right with white (255, 255, 255)
-            padded = np.pad(
-                image,
-                ((0, pad_h), (0, pad_w), (0, 0)),
-                mode="constant",
-                constant_values=255,
-            )
-        else:
-            # Grayscale image - pad with white (255)
-            padded = np.pad(
-                image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
-            )
-
-        return padded
-
-    def preprocess(
-        self,
-        images: Image.Image | list[Image.Image],
-        **kwargs,
-    ) -> dict[str, torch.Tensor]:
-        """
-        Preprocess an image or batch of images for the NemotronParse model.
-
-        Args:
-            images: Input image(s)
-        """
-        # Ensure images is a list
-        if not isinstance(images, list):
-            images = [images]
-
-        # Convert PIL images to numpy arrays if needed
-        processed_images = []
-        for image in images:
-            if isinstance(image, Image.Image):
-                image = np.asarray(image)
-            processed_images.append(image)
-
-        # Apply NemotronParse-specific transforms
-        pixel_values = []
-        for image in processed_images:
-            # Manual resize with aspect ratio preservation
-            # (replaces LongestMaxSizeHW)
-            processed_image = self._resize_with_aspect_ratio(image)
-
-            # Apply remaining albumentations transforms if available
-            if self.transform is not None:
-                transformed = self.transform(image=processed_image)
-                processed_image = transformed["image"]
-            else:
-                # Fallback: just pad to target size
-                processed_image = self._pad_to_size(processed_image)
-
-            # Convert to tensor
-            pixel_values_tensor = self.torch_transform(processed_image)
-
-            # Handle grayscale images
-            if pixel_values_tensor.shape[0] == 1:
-                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
-
-            pixel_values.append(pixel_values_tensor)
-
-        # Stack into batch
-        pixel_values = torch.stack(pixel_values)
-
-        # Normalize pixel values
-        normalized_values = (pixel_values - self.norm_mean) / self.norm_std
-        return {"pixel_values": normalized_values}
-
-    def __call__(
-        self, images: Image.Image | list[Image.Image], **kwargs
-    ) -> dict[str, torch.Tensor]:
-        return self.preprocess(images, **kwargs)
-
-
-class NemotronParseProcessor:
-    """
-    NemotronParse Processor
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)
-
-    def _make_batch_input(self, input_item=None):
-        if input_item is None:
-            input_item = []
-        if not isinstance(input_item, list):
-            input_item = [input_item]
-        return input_item
-
-    def __call__(
-        self,
-        text: str | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        return_tensors: str | TensorType | None = None,
-        **kwargs,
-    ) -> BatchFeature:
-        text, images = [self._make_batch_input(x) for x in (text, images)]
-        image_inputs = {} if len(images) == 0 else self.image_processor(images)
-
-        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
-        combined_outputs = BatchFeature(
-            data={**text_inputs, **image_inputs},
-            tensor_type=return_tensors,
-        )
-        return combined_outputs
-
-
 class NemotronParseProcessingInfo(BaseProcessingInfo):
    def get_hf_config(self):
        return self.ctx.get_hf_config()

--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
-# --------------------------------------------------------
-# InternVL
-# Copyright (c) 2023 OpenGVLab
-# Licensed under The MIT License [see LICENSE for details]
-# --------------------------------------------------------
 import math
-from abc import ABC
 from collections.abc import Iterable

 import torch
 import torch.nn as nn
-import torchvision.transforms as T
-from PIL import Image
 from transformers import AutoModel, PretrainedConfig
-from transformers.image_processing_utils_fast import BaseImageProcessorFast

 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
    InternVLImageEmbeddingInputs,
    InternVLImageInputs,
    InternVLImagePixelInputs,
-    InternVLProcessor,
 )
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.processing import PromptUpdateDetails
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.processor import cached_image_processor_from_config
+from vllm.transformers_utils.processors.nemotron_vl import (
+    LlamaNemotronVLEmbedProcessor,
+    NemotronVLProcessor,
+)
 from vllm.transformers_utils.repo_utils import get_hf_file_to_dict

 from .interfaces import (
@@ -58,310 +47,6 @@ from .utils import (
 )


-def build_transform(input_size: int):
-    return T.Compose(
-        [
-            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
-            T.Resize(
-                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
-            ),
-            T.ToTensor(),
-        ]
-    )
-
-
-# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
-def find_closest_aspect_ratio(
-    aspect_ratio: float,
-    target_ratios: list[tuple[int, int]],
-    *,
-    width: int,
-    height: int,
-    image_size: int,
-) -> tuple[int, int]:
-    best_factor = float("-inf")
-    best_ratio = (1, 1)
-    area = width * height
-
-    for rw, rh in target_ratios:
-        target_aspect_ratio = rw / rh
-        size_factor = min((rw * rh * image_size * image_size) / area, 0.6)
-        ratio_closeness = min(
-            target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio
-        )
-        factor = size_factor * ratio_closeness
-
-        if factor > best_factor:
-            best_factor = factor
-            best_ratio = (rw, rh)
-
-    return best_ratio
-
-
-def calculate_nemotron_vl_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height
-
-
-def dynamic_preprocess_nemotron_vl(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> list[Image.Image]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_nemotron_vl_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images
-
-
-def get_nemotron_vl_target_ratios(
-    min_num: int,
-    max_num: int,
-) -> list[tuple[int, int]]:
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if min_num <= i * j <= max_num
-    }
-    return sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-
-def image_to_pixel_values_nemotron_vl(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-    transform: T.Compose | None = None,
-) -> torch.Tensor:
-    target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
-
-    if transform is None:
-        transform = build_transform(input_size=input_size)
-
-    images = dynamic_preprocess_nemotron_vl(
-        image,
-        target_ratios=target_ratios,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values
-
-
-class NemotronVLProcessor(InternVLProcessor):
-    IMG_START = "<img>"
-    IMG_END = "</img>"
-    IMG_CONTEXT = "<image>"
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        image_processor: BaseImageProcessorFast | None = None,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        ABC.__init__(self)
-        self.config = config
-        self.tokenizer = tokenizer
-        self.image_processor = image_processor
-        image_size: int = config.force_image_size
-        patch_size: int = config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = 1
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = self.image_processor.max_num_tiles
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = True
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size
-
-        if image_processor is not None:
-            self.use_thumbnail = image_processor.use_thumbnail
-        else:
-            self.use_thumbnail = getattr(config, "use_thumbnail", True)
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
-
-    def _get_transform(self) -> T.Compose:
-        return build_transform(input_size=self.image_size)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_nemotron_vl_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_nemotron_vl(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-                transform=self._get_transform(),
-            )
-            for image in images
-        ]
-
-    def _replace_image_tokens(
-        self,
-        text: list[str],
-        pixel_values_lst: list[torch.Tensor],
-    ) -> list[str]:
-        """Replace <image> placeholders with image tokens."""
-        for pixel_values in pixel_values_lst:
-            num_patches = pixel_values.shape[0]
-            feature_size = num_patches * self.num_image_token
-            image_repl = self.get_image_repl(feature_size, num_patches)
-            # Use temporary placeholder to avoid replacing tokens we just inserted
-            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
-            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
-        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
-
-    def _preprocess_image(
-        self,
-        text: list[str],
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> tuple[list[str], dict[str, torch.Tensor]]:
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            text = self._replace_image_tokens(text, pixel_values_lst)
-        return text, image_inputs
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = self.IMG_CONTEXT * feature_size
-        repl_full = self.IMG_START + repl_features + self.IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
-
-
 class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
    """Processing info for Nemotron VL models."""

@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
 #   - Pooler output instead of generative logits
 # --------------------------------------------------------

-# SigLIP normalization constants
-SIGLIP_MEAN = (0.5, 0.5, 0.5)
-SIGLIP_STD = (0.5, 0.5, 0.5)
-
-
-def build_siglip_transform(input_size: int):
-    """Build transform for SigLIP vision encoder with normalization.
-
-    Extends the base transform from nemotron_vl with SigLIP-specific normalization.
-    """
-    base_transform = build_transform(input_size=input_size)
-    return T.Compose(
-        [
-            base_transform,
-            T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
-        ]
-    )
-
-
-class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
-    """
-    Processor for LlamaNemotronVL embedding model.
-
-    Inherits from NemotronVLProcessor and specializes it for embedding tasks:
-    - Uses SigLIP transform with normalization instead of base transform
-    - Uses different image context token (<IMG_CONTEXT> vs <image>)
-    """
-
-    IMG_CONTEXT = "<IMG_CONTEXT>"
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        processor_config: dict,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        if min_dynamic_patch is None:
-            min_dynamic_patch = processor_config.get(
-                "min_input_tiles",
-                getattr(config, "min_dynamic_patch", 1),
-            )
-        if max_dynamic_patch is None:
-            max_dynamic_patch = processor_config.get(
-                "max_input_tiles",
-                getattr(config, "max_dynamic_patch", 1),
-            )
-        if dynamic_image_size is None:
-            dynamic_image_size = processor_config.get(
-                "dynamic_image_size",
-                getattr(config, "dynamic_image_size", True),
-            )
-        super().__init__(
-            config=config,
-            tokenizer=tokenizer,
-            image_processor=None,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-    def _get_transform(self) -> T.Compose:
-        """Override to add SigLIP normalization."""
-        return build_siglip_transform(input_size=self.image_size)
-
-    def _replace_image_tokens(
-        self,
-        text: list[str],
-        pixel_values_lst: list[torch.Tensor],
-    ) -> list[str]:
-        """Override with simpler token replacement for embedding model.
-
-        No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
-        not <image>, so there's no collision risk.
-        """
-        for pixel_values in pixel_values_lst:
-            num_patches = pixel_values.shape[0]
-            feature_size = num_patches * self.num_image_token
-            image_repl = self.get_image_repl(feature_size, num_patches)
-            text = [t.replace("<image>", image_repl.full, 1) for t in text]
-        return text
-

 class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
    """Processing info for LlamaNemotronVL embedding model."""

--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
    PromptUpdate,
    PromptUpdateDetails,
 )
+from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor

 from .intern_vit import InternVisionModel
 from .internvl import (
    BaseInternVLDummyInputsBuilder,
    BaseInternVLMultiModalProcessor,
    BaseInternVLProcessingInfo,
-    BaseInternVLProcessor,
    InternVLChatModel,
 )

-IMG_PAD = "<|vision_pad|>"
-
-
-class NVLMProcessor(BaseInternVLProcessor):
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_PAD]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        if num_patches is None:
-            raise NotImplementedError("Embedding inputs are not supported")
-
-        tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
-        if self.use_thumbnail:
-            tile_pos_identifiers += ["<tile_global_thumbnail>"]
-
-        context_size = feature_size // num_patches
-        features = "".join(
-            identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
-        )
-
-        # We include the start and end as well because "<Image><tile" is
-        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
-        # when trying to find "<tile" as a subsequence of "<Image><tile"
-        repl = "<Image>" + features + "</Image>"
-
-        return PromptUpdateDetails.select_text(repl, IMG_PAD)
-

 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
    def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:

--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias

 import torch
 import torch.nn as nn
-import torchvision.transforms as T
-from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature, PretrainedConfig

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
    InternVisionPatchModel,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalFieldConfig,
@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
    BaseProcessingInfo,
    PromptReplacement,
    PromptUpdate,
-    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix

-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<IMG_CONTEXT>"
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-

 class SkyworkR1VImagePixelInputs(TensorSchema):
    """
@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
 )


-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
-def build_transform(input_size: int):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose(
-        [
-            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
-            T.Resize(
-                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
-            ),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
-        ]
-    )
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
-def find_closest_aspect_ratio(
-    aspect_ratio: float,
-    target_ratios: list[tuple[int, int]],
-    *,
-    width: int,
-    height: int,
-    image_size: int,
-) -> tuple[int, int]:
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def resolve_skyworkr1v_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_skyworkr1v_target_ratios(
-    min_num: int,
-    max_num: int,
-) -> list[tuple[int, int]]:
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if min_num <= i * j <= max_num
-    }
-    return sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-
-def calculate_skyworkr1v_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height
-
-
-def dynamic_preprocess_skyworkr1v(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> list[Image.Image]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_skyworkr1v_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
-def image_to_pixel_values_skyworkr1v(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess_skyworkr1v(
-        image,
-        target_ratios=target_ratios,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values
-
-
-class SkyworkR1VProcessor:
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        image_size: int = config.vision_config.image_size
-        patch_size: int = config.vision_config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = config.min_dynamic_patch
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = config.max_dynamic_patch
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = config.dynamic_image_size
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_skyworkr1v_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-        return get_skyworkr1v_target_ratios(min_num, max_num)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_skyworkr1v_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_skyworkr1v(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-            )
-            for image in images
-        ]
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-
-                image_repl = self.get_image_repl(feature_size, num_patches)
-
-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
-
-        text_inputs = self.tokenizer(text)
-
-        combined_outputs = {**text_inputs, **image_inputs}
-
-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
-
-
 class SkyworkR1VProcessingInfo(BaseProcessingInfo):
    def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
        return self.ctx.init_processor(

--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -13,35 +13,53 @@ import importlib
 __all__ = [
    "BagelProcessor",
    "DeepseekVLV2Processor",
+    "Eagle2_5_VLProcessor",
    "FireRedASR2Processor",
    "FunASRProcessor",
    "GLM4VProcessor",
+    "H2OVLProcessor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
+    "InternVLProcessor",
    "KimiAudioProcessor",
    "MistralCommonPixtralProcessor",
    "MistralCommonVoxtralProcessor",
+    "NanoNemotronVLProcessor",
+    "NemotronParseProcessor",
+    "NemotronVLProcessor",
+    "LlamaNemotronVLEmbedProcessor",
+    "NVLMProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
+    "SkyworkR1VProcessor",
 ]

 _CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
+    "Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
    "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
    "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
+    "H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
    "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
    "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
+    "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
    "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
    "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
    "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
+    "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
+    "NemotronParseProcessor": "vllm.transformers_utils.processors.nemotron_parse",
+    "NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
+    "LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
+    "NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
    "OvisProcessor": "vllm.transformers_utils.processors.ovis",
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
+    "SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
 }



--- a/vllm/transformers_utils/processors/eagle2_5_vl.py
+++ b/vllm/transformers_utils/processors/eagle2_5_vl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from NVIDIA Eagle2.5-VL model
+# https://huggingface.co/nvidia/Eagle2.5-8B
+from transformers import PretrainedConfig
+
+from vllm.multimodal.processing import PromptUpdateDetails
+from vllm.tokenizers import TokenizerLike
+
+from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
+
+
+class Eagle2_5_VLProcessor(BaseInternVLProcessor):
+    """
+    Custom processor for Eagle2.5-VL model.
+    Extends BaseInternVLProcessor with Eagle-specific token handling.
+
+    ``__init__`` deliberately does not call ``super().__init__()``; it sets
+    every attribute itself and falls back to hard-coded defaults for config
+    fields that are missing.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> None:
+        """Initialize processor state directly from ``config`` and ``tokenizer``.
+
+        Args:
+            config: Model config. ``vision_config.image_size`` and
+                ``vision_config.patch_size`` are required;
+                ``force_image_size``, ``downsample_ratio``,
+                ``min_dynamic_patch``, ``max_dynamic_patch``,
+                ``dynamic_image_size`` and ``use_thumbnail`` are optional.
+            tokenizer: Tokenizer stored on the instance; its vocabulary is
+                the fallback source for ``image_token_id``.
+            min_dynamic_patch: Used instead of ``config.min_dynamic_patch``
+                (fallback 1) when not ``None``.
+            max_dynamic_patch: Used instead of ``config.max_dynamic_patch``
+                (fallback 12) when not ``None``.
+            dynamic_image_size: Used instead of ``config.dynamic_image_size``
+                (fallback ``True``) when not ``None``.
+        """
+        # Skip super().__init__() to avoid config manipulation
+        # Directly initialize all required attributes
+        self.config = config
+        self.tokenizer = tokenizer
+
+        # Image size with force_image_size override
+        # (a falsy force_image_size, e.g. None or 0, leaves the vision
+        # config's image_size in effect)
+        image_size: int = config.vision_config.image_size
+        if hasattr(config, "force_image_size") and config.force_image_size:
+            image_size = config.force_image_size
+
+        patch_size: int = config.vision_config.patch_size
+        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
+
+        # Compute num_image_token: squared patch-grid side
+        # (image_size // patch_size), scaled by downsample_ratio**2 and
+        # truncated to an int
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (downsample_ratio**2)
+        )
+        self.image_size = image_size
+
+        # Dynamic patch settings with defaults
+        # (explicit keyword argument > config attribute > hard-coded fallback)
+        self.min_dynamic_patch = (
+            min_dynamic_patch
+            if min_dynamic_patch is not None
+            else getattr(config, "min_dynamic_patch", 1)
+        )
+        self.max_dynamic_patch = (
+            max_dynamic_patch
+            if max_dynamic_patch is not None
+            else getattr(config, "max_dynamic_patch", 12)
+        )
+        self.dynamic_image_size = (
+            dynamic_image_size
+            if dynamic_image_size is not None
+            else getattr(config, "dynamic_image_size", True)
+        )
+        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
+
+    @property
+    def image_token_id(self) -> int:
+        """Get the image token ID from config or tokenizer.
+
+        ``config.image_token_index`` wins when the attribute exists;
+        otherwise ``IMG_CONTEXT`` is looked up in the tokenizer vocabulary.
+
+        Raises:
+            ValueError: If the config has no ``image_token_index`` and
+                ``IMG_CONTEXT`` is not in the vocabulary.
+        """
+        if hasattr(self.config, "image_token_index"):
+            return self.config.image_token_index
+        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
+        vocab = self.tokenizer.get_vocab()
+        if IMG_CONTEXT in vocab:
+            return vocab[IMG_CONTEXT]
+        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """Get image replacement string for prompt.
+
+        The full text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
+        ``feature_size`` times, then ``IMG_END``. It is wrapped with
+        ``PromptUpdateDetails.select_text`` using ``IMG_CONTEXT`` as the
+        selected text. ``num_patches`` is accepted but not used here.
+        """
+        repl_features = IMG_CONTEXT * feature_size
+        repl_full = IMG_START + repl_features + IMG_END
+
+        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
--- a/vllm/transformers_utils/processors/h2ovl.py
+++ b/vllm/transformers_utils/processors/h2ovl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
+# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
+# --------------------------------------------------------
+# H2OVL-Mississippi
+# Copyright (c) 2024 H2O.AI
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+import torch
+from PIL import Image
+from transformers import PretrainedConfig
+
+from vllm.multimodal.processing import PromptUpdateDetails
+from vllm.tokenizers import TokenizerLike
+
+from .internvl import (
+    IMG_CONTEXT,
+    IMG_END,
+    IMG_START,
+    BaseInternVLProcessor,
+    build_transform,
+    find_closest_aspect_ratio,
+    get_internvl_target_ratios,
+)
+
+
+def resolve_h2ovl_min_max_num(
+    *,
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
+    use_thumbnail: bool,
+) -> tuple[int, int]:
+    """Resolve the effective minimum and maximum patch counts.
+
+    Both bounds collapse to 1 when ``dynamic_image_size`` is false. When
+    ``use_thumbnail`` is true and the resulting maximum is not 1, the
+    maximum is increased by one.
+
+    Returns:
+        ``(min_dynamic_patch, max_dynamic_patch)`` after these adjustments.
+    """
+    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
+
+    # Extra slot, presumably for the thumbnail patch; never added when the
+    # maximum is exactly 1.
+    if use_thumbnail and max_dynamic_patch != 1:
+        max_dynamic_patch += 1
+
+    return min_dynamic_patch, max_dynamic_patch
+
+
+def get_h2ovl_target_ratios(
+    min_num: int,
+    max_num: int,
+    *,
+    prior_aspect_ratio: tuple[int, int] | None,
+) -> list[tuple[int, int]]:
+    """Return candidate ratios, optionally filtered by a prior ratio.
+
+    Starts from ``get_internvl_target_ratios(min_num, max_num)``. When
+    ``prior_aspect_ratio`` is given, a candidate is kept only if neither
+    component of ``prior_aspect_ratio`` is divisible by the matching
+    component of the candidate. Any candidate with a component equal to 1
+    is therefore dropped, since every integer is divisible by 1.
+
+    Args:
+        min_num: Lower bound passed to ``get_internvl_target_ratios``.
+        max_num: Upper bound passed to ``get_internvl_target_ratios``.
+        prior_aspect_ratio: Ratio to filter against, or ``None`` to skip
+            filtering.
+
+    Returns:
+        The (possibly filtered) list of ratio pairs; it may be empty after
+        filtering.
+    """
+    target_ratios = get_internvl_target_ratios(min_num, max_num)
+
+    # if prior_aspect_ratio is provided, filter the target ratios
+    if prior_aspect_ratio is not None:
+        target_ratios = [
+            ratio
+            for ratio in target_ratios
+            if prior_aspect_ratio[0] % ratio[0] != 0
+            and prior_aspect_ratio[1] % ratio[1] != 0
+        ]
+
+    return target_ratios
+
+
+# modified to include blocks generated in second pass
+def calculate_h2ovl_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int, tuple[int, int]]:
+    """Pick the best-fitting ratio and derive the resize target from it.
+
+    The ratio is chosen by ``find_closest_aspect_ratio`` using
+    ``orig_width / orig_height``, so ``orig_height == 0`` raises
+    ``ZeroDivisionError``.
+
+    Returns:
+        ``(blocks, target_width, target_height, target_aspect_ratio)`` where
+        ``target_width`` and ``target_height`` are ``image_size`` times the
+        first and second ratio components, and ``blocks`` is the product of
+        the two components, plus one when ``use_thumbnail`` is true and
+        that product is not 1.
+    """
+    aspect_ratio = orig_width / orig_height
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # add thumbnail image if num_blocks != 1
+    if use_thumbnail and blocks != 1:
+        blocks += 1
+
+    return blocks, target_width, target_height, target_aspect_ratio
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+# refactored to handle prior_aspect_ratio
+def dynamic_preprocess_h2ovl(
+    image: Image.Image,
+    *,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[list[Image.Image], tuple[int, int]]:
+    """Resize ``image`` to the chosen grid and cut it into square tiles.
+
+    The image is resized to the target size from ``calculate_h2ovl_targets``
+    and cropped into ``image_size`` x ``image_size`` tiles in row-major
+    order (left to right, then top to bottom). When ``use_thumbnail`` is
+    true and more than one tile was produced, the whole original image
+    resized to ``image_size`` x ``image_size`` is appended as the last
+    element.
+
+    Returns:
+        ``(tiles, target_aspect_ratio)``, where ``target_aspect_ratio`` is
+        the ratio selected by ``calculate_h2ovl_targets``.
+    """
+    orig_width, orig_height = image.size
+
+    # calculate the number of blocks without thumbnail
+    (
+        blocks,
+        target_width,
+        target_height,
+        target_aspect_ratio,
+    ) = calculate_h2ovl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        # (left, upper, right, lower) of tile i; target_width // image_size
+        # is the number of tile columns, so i % cols is the column index
+        # and i // cols the row index.
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+
+    assert len(processed_images) == blocks
+
+    # The thumbnail is taken from the original image, not the resized one.
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+
+    return processed_images, target_aspect_ratio
+
+
+def _preprocess_image(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    prior_aspect_ratio: tuple[int, int] | None,
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """
+    Run one tiling pass over `image` and stack the transformed tiles.
+
+    Args:
+        image: Source image.
+        input_size: Tile side length, also used to build the transform.
+        min_num: Lower bound passed to `get_h2ovl_target_ratios`.
+        max_num: Upper bound passed to `get_h2ovl_target_ratios`.
+        use_thumbnail: Forwarded to `dynamic_preprocess_h2ovl`.
+        prior_aspect_ratio: Optional ratio used to filter the candidates.
+
+    Returns:
+        The stacked tile tensor and the ratio chosen for this pass.
+    """
+    candidate_ratios = get_h2ovl_target_ratios(
+        min_num,
+        max_num,
+        prior_aspect_ratio=prior_aspect_ratio,
+    )
+    to_tensor = build_transform(input_size=input_size)
+
+    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
+        image,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+        target_ratios=candidate_ratios,
+    )
+
+    stacked = torch.stack([to_tensor(tile) for tile in tiles])
+    return stacked, chosen_ratio
+
+
+# refactored to use the _preprocess_image function
+def image_to_pixel_values_h2ovl(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    use_msac: bool,
+) -> torch.Tensor:
+    """
+    Convert one image into stacked tile pixel values.
+
+    Without MSAC a single tiling pass is run with the given `min_num`,
+    `max_num` and `use_thumbnail`. With MSAC two passes are run (minimum
+    tile counts 1 and 3, thumbnail always requested) and their outputs are
+    merged; `min_num` and `use_thumbnail` are not consulted on that path.
+
+    Returns:
+        A tensor with one entry per tile along the first dimension.
+    """
+    if not use_msac:
+        single_pass, _ = _preprocess_image(
+            image,
+            input_size=input_size,
+            min_num=min_num,
+            max_num=max_num,
+            use_thumbnail=use_thumbnail,
+            prior_aspect_ratio=None,
+        )
+        return single_pass
+
+    # MSAC: the first pass picks a ratio, the second pass avoids it.
+    first_pass, first_ratio = _preprocess_image(
+        image,
+        input_size=input_size,
+        min_num=1,
+        max_num=max_num,
+        use_thumbnail=True,
+        prior_aspect_ratio=None,
+    )
+    second_pass, _ = _preprocess_image(
+        image,
+        input_size=input_size,
+        min_num=3,
+        max_num=max_num,
+        use_thumbnail=True,
+        prior_aspect_ratio=first_ratio,
+    )
+
+    # Order: second pass without its last entry, first pass without its
+    # last entry, then the last entry of the second pass (the thumbnail
+    # whenever that pass produced more than one tile).
+    merged = [second_pass[:-1], first_pass[:-1], second_pass[-1:]]
+    return torch.cat(merged, 0)
+
+
+class H2OVLProcessor(BaseInternVLProcessor):
+    """
+    Processor for H2OVL built on top of `BaseInternVLProcessor`.
+
+    On top of the base tiling it supports an optional two-pass mode
+    (`use_msac`), whose default is read from `config.use_msac`.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_msac: bool | None = None,
+    ) -> None:
+        super().__init__(
+            config,
+            tokenizer,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        # An explicit argument wins; otherwise defer to the model config.
+        msac_flag = config.use_msac if use_msac is None else use_msac
+        assert isinstance(msac_flag, bool)
+
+        self.use_msac = msac_flag
+
+    @property
+    def image_token_id(self) -> int:
+        """Vocabulary id of the `IMG_CONTEXT` token."""
+        vocab = self.tokenizer.get_vocab()
+        return vocab[IMG_CONTEXT]
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """
+        Build the replacement text for one image placeholder.
+
+        The text is `feature_size` copies of `IMG_CONTEXT` wrapped in
+        `IMG_START`/`IMG_END`; `num_patches` is not used here.
+        """
+        wrapped = f"{IMG_START}{IMG_CONTEXT * feature_size}{IMG_END}"
+        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)
+
+    def resolve_min_max_num(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+    ) -> tuple[int, int]:
+        """
+        Resolve the tile-count bounds, using instance defaults for every
+        argument left as `None`.
+        """
+        if min_dynamic_patch is None:
+            min_dynamic_patch = self.min_dynamic_patch
+        if max_dynamic_patch is None:
+            max_dynamic_patch = self.max_dynamic_patch
+        if dynamic_image_size is None:
+            dynamic_image_size = self.dynamic_image_size
+        if use_thumbnail is None:
+            use_thumbnail = self.use_thumbnail
+
+        return resolve_h2ovl_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+
+    def resolve_target_ratios(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+        prior_aspect_ratio: tuple[int, int] | None = None,
+        override_min_num: int | None = None,
+    ) -> list[tuple[int, int]]:
+        """
+        Compute the candidate tiling ratios.
+
+        `override_min_num`, when given, replaces the resolved lower bound;
+        `prior_aspect_ratio` is forwarded to `get_h2ovl_target_ratios`.
+        """
+        resolved_min, resolved_max = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+        lower = resolved_min if override_min_num is None else override_min_num
+
+        return get_h2ovl_target_ratios(
+            lower,
+            resolved_max,
+            prior_aspect_ratio=prior_aspect_ratio,
+        )
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        use_msac: bool | None = None,
+    ) -> int:
+        """
+        Return the number of image tokens for an image of the given size.
+
+        `use_msac` defaults to the instance setting. In MSAC mode the
+        patch counts of both passes are added and reduced by one.
+        """
+        if use_msac is None:
+            use_msac = self.use_msac
+
+        thumbnail_enabled = self.use_thumbnail
+
+        if not use_msac:
+            ratios = self.resolve_target_ratios(
+                use_thumbnail=False,  # Applied in calculate_targets
+            )
+            patch_count, _, _, _ = calculate_h2ovl_targets(
+                orig_width=image_width,
+                orig_height=image_height,
+                image_size=self.image_size,
+                target_ratios=ratios,
+                use_thumbnail=thumbnail_enabled,
+            )
+            return patch_count * self.num_image_token
+
+        # First pass: minimum of one tile, no prior ratio.
+        first_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
+            override_min_num=1,
+        )
+        first_count, _, _, first_ratio = calculate_h2ovl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=self.image_size,
+            target_ratios=first_ratios,
+            use_thumbnail=True,
+        )
+
+        # Second pass: minimum of three tiles, filtered by the first ratio.
+        second_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
+            prior_aspect_ratio=first_ratio,
+            override_min_num=3,
+        )
+        second_count, _, _, _ = calculate_h2ovl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=self.image_size,
+            target_ratios=second_ratios,
+            use_thumbnail=True,
+        )
+
+        patch_count = first_count + second_count - 1
+        return patch_count * self.num_image_token
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        """
+        Convert each image into its stacked tile tensor.
+
+        MSAC is only applied when exactly one image is passed in.
+        """
+        msac_enabled = len(images) == 1 and self.use_msac
+
+        lower, upper = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
+        )
+
+        pixel_values_lst = []
+        for image in images:
+            pixel_values_lst.append(
+                image_to_pixel_values_h2ovl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=lower,
+                    max_num=upper,
+                    use_thumbnail=self.use_thumbnail,
+                    use_msac=msac_enabled,
+                )
+            )
+        return pixel_values_lst
--- a/vllm/transformers_utils/processors/internvl.py
+++ b/vllm/transformers_utils/processors/internvl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from abc import ABC, abstractmethod
+from typing import Any, TypeVar
+
+import numpy.typing as npt
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from transformers import BatchFeature, PretrainedConfig, TensorType
+
+from vllm.multimodal.image import convert_image_mode
+from vllm.multimodal.processing import PromptUpdateDetails
+from vllm.tokenizers import TokenizerLike
+
+# Element type for `BaseInternVLProcessor._make_batch_input`.
+_T = TypeVar("_T")
+
+# Text markers used when building image/video replacement strings:
+# IMG_START and IMG_END wrap a run of repeated IMG_CONTEXT tokens.
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<IMG_CONTEXT>"
+
+# Per-channel mean/std handed to `T.Normalize` in `build_transform`.
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def build_transform(input_size: int):
+    """
+    Build the per-tile preprocessing pipeline.
+
+    The pipeline converts to RGB, resizes to `input_size` x `input_size`
+    with bicubic interpolation, converts to a tensor and normalizes with
+    `IMAGENET_MEAN` / `IMAGENET_STD`.
+    """
+    square = (input_size, input_size)
+    bicubic = T.InterpolationMode.BICUBIC
+
+    steps = [
+        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
+        T.Resize(square, interpolation=bicubic),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
+    ]
+    return T.Compose(steps)
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def find_closest_aspect_ratio(
+    aspect_ratio: float,
+    target_ratios: list[tuple[int, int]],
+    *,
+    width: int,
+    height: int,
+    image_size: int,
+) -> tuple[int, int]:
+    """
+    Select the candidate ratio closest to `aspect_ratio`.
+
+    Candidates are scanned in order. A strictly closer candidate always
+    wins; a candidate that ties with the current best replaces it only if
+    the image area (`width * height`) exceeds half the area its tiles
+    would cover. `(1, 1)` is returned when `target_ratios` is empty.
+    """
+    chosen = (1, 1)
+    smallest_gap = float("inf")
+    image_area = width * height
+    half_tile_area = 0.5 * image_size * image_size
+
+    for candidate in target_ratios:
+        first, second = candidate
+        gap = abs(aspect_ratio - first / second)
+        if gap < smallest_gap:
+            smallest_gap = gap
+            chosen = candidate
+        elif gap == smallest_gap and image_area > half_tile_area * first * second:
+            # Tie: prefer this candidate only for sufficiently large images.
+            chosen = candidate
+
+    return chosen
+
+
+def resolve_internvl_min_max_num(
+    *,
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
+    use_thumbnail: bool,
+) -> tuple[int, int]:
+    """
+    Resolve the `(min, max)` tile-count bounds.
+
+    Both bounds collapse to 1 when `dynamic_image_size` is false.
+    Otherwise the upper bound is raised by one when `use_thumbnail` is
+    set and the upper bound is not already 1.
+    """
+    if not dynamic_image_size:
+        # Fixed-size mode: a single tile, so no thumbnail slot either.
+        return 1, 1
+
+    upper = max_dynamic_patch
+    if use_thumbnail and upper != 1:
+        upper += 1
+
+    return min_dynamic_patch, upper
+
+
+def get_internvl_target_ratios(
+    min_num: int,
+    max_num: int,
+) -> list[tuple[int, int]]:
+    """
+    Enumerate every `(i, j)` pair whose product lies in `[min_num, max_num]`.
+
+    Returns:
+        The distinct pairs sorted by ascending product `i * j`.
+    """
+    unique_ratios: set[tuple[int, int]] = set()
+    for side_limit in range(min_num, max_num + 1):
+        sides = range(1, side_limit + 1)
+        for first in sides:
+            for second in sides:
+                if min_num <= first * second <= max_num:
+                    unique_ratios.add((first, second))
+
+    ordered = list(unique_ratios)
+    ordered.sort(key=lambda ratio: ratio[0] * ratio[1])
+    return ordered
+
+
+def calculate_internvl_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int]:
+    """
+    Pick the best tiling ratio for an image and derive the resize targets.
+
+    Args:
+        orig_width: Width of the source image.
+        orig_height: Height of the source image (used as a divisor).
+        target_ratios: Candidate ratios handed to
+            `find_closest_aspect_ratio`.
+        image_size: Side length of one tile.
+        use_thumbnail: Whether one extra block is counted when the chosen
+            ratio yields more than one block.
+
+    Returns:
+        `(blocks, target_width, target_height)`.
+    """
+    first, second = find_closest_aspect_ratio(
+        orig_width / orig_height,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
+
+    # The thumbnail only counts when the image is split into several blocks.
+    num_blocks = first * second
+    if use_thumbnail and num_blocks != 1:
+        num_blocks += 1
+
+    return num_blocks, image_size * first, image_size * second
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def dynamic_preprocess_internvl(
+    image: Image.Image,
+    *,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> list[Image.Image]:
+    """
+    Resize `image` to the chosen tiling grid and cut it into square tiles.
+
+    Args:
+        image: Source image.
+        target_ratios: Candidate ratios for `calculate_internvl_targets`.
+        image_size: Side length of every tile.
+        use_thumbnail: When true and more than one tile was produced, a
+            copy of the whole image resized to one tile is appended last.
+
+    Returns:
+        The list of tiles, followed by the optional thumbnail.
+    """
+    orig_width, orig_height = image.size
+
+    # The thumbnail is handled below, so ask for the bare tile count here.
+    num_tiles, resized_width, resized_height = calculate_internvl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
+    resized = image.resize((resized_width, resized_height))
+    tiles_per_row = resized_width // image_size
+
+    tiles = []
+    for tile_idx in range(num_tiles):
+        # Tiles are laid out row by row, left to right.
+        row, col = divmod(tile_idx, tiles_per_row)
+        left = col * image_size
+        top = row * image_size
+        tiles.append(
+            resized.crop((left, top, left + image_size, top + image_size))
+        )
+
+    assert len(tiles) == num_tiles
+
+    if use_thumbnail and len(tiles) != 1:
+        tiles.append(image.resize((image_size, image_size)))
+
+    return tiles
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def image_to_pixel_values_internvl(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    """
+    Tile `image` and stack the transformed tiles into one tensor.
+
+    Args:
+        image: Source image.
+        input_size: Tile side length, also used to build the transform.
+        min_num: Lower bound for the candidate ratios.
+        max_num: Upper bound for the candidate ratios.
+        use_thumbnail: Forwarded to `dynamic_preprocess_internvl`.
+
+    Returns:
+        A tensor with one entry per tile along the first dimension.
+    """
+    candidate_ratios = get_internvl_target_ratios(min_num, max_num)
+    to_tensor = build_transform(input_size=input_size)
+
+    tiles = dynamic_preprocess_internvl(
+        image,
+        target_ratios=candidate_ratios,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+    )
+
+    return torch.stack([to_tensor(tile) for tile in tiles])
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def video_to_pixel_values_internvl(
+    video: npt.NDArray,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    """
+    Convert every frame of `video` into one transformed tile and stack them.
+
+    Each frame is turned into an RGB PIL image and run through
+    `dynamic_preprocess_internvl`; exactly one tile per frame is required
+    (enforced by an assertion).
+
+    Returns:
+        A tensor with one entry per frame along the first dimension.
+    """
+    candidate_ratios = get_internvl_target_ratios(min_num, max_num)
+    to_tensor = build_transform(input_size=input_size)
+
+    frame_images: list[Image.Image] = []
+    for frame in video:
+        frame_tiles = dynamic_preprocess_internvl(
+            Image.fromarray(frame, mode="RGB"),
+            target_ratios=candidate_ratios,
+            image_size=input_size,
+            use_thumbnail=use_thumbnail,
+        )
+        # A frame must never be split into several tiles.
+        assert len(frame_tiles) == 1
+        frame_images.extend(frame_tiles)
+
+    return torch.stack([to_tensor(img) for img in frame_images])
+
+
+class BaseInternVLProcessor(ABC):
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+
+    Subclasses must provide `image_token_id` and `get_image_repl`; this
+    base class handles tiling images into pixel values, expanding the
+    "<image>" placeholders in the text and tokenizing the result.
+
+    The code to insert image tokens is based on:
+    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> None:
+        """
+        Store the config/tokenizer and resolve the tiling defaults.
+
+        Every keyword argument left as `None` falls back to the attribute
+        of the same name on `config`. The resolved values are type-checked
+        with assertions.
+        """
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        image_size: int = config.vision_config.image_size
+        patch_size: int = config.vision_config.patch_size
+
+        if min_dynamic_patch is None:
+            min_dynamic_patch = config.min_dynamic_patch
+        assert isinstance(min_dynamic_patch, int)
+
+        if max_dynamic_patch is None:
+            max_dynamic_patch = config.max_dynamic_patch
+        assert isinstance(max_dynamic_patch, int)
+
+        if dynamic_image_size is None:
+            dynamic_image_size = config.dynamic_image_size
+        assert isinstance(dynamic_image_size, bool)
+
+        # Tokens per tile: squared patches-per-side scaled by the squared
+        # downsample ratio, truncated to an int.
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
+        self.image_size = image_size
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail: bool = config.use_thumbnail
+
+    @property
+    @abstractmethod
+    def image_token_id(self) -> int:
+        """Token id of the image context token; supplied by subclasses."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """
+        Build the replacement for one "<image>" placeholder.
+
+        `_preprocess_image` calls this with `feature_size` equal to
+        `num_patches * self.num_image_token` and inserts the `.full` text
+        of the result into the prompt.
+        """
+        raise NotImplementedError
+
+    def resolve_min_max_num(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+    ) -> tuple[int, int]:
+        """
+        Resolve the `(min, max)` tile-count bounds.
+
+        Arguments left as `None` fall back to the values stored on the
+        instance; the result comes from `resolve_internvl_min_max_num`.
+        """
+        min_dynamic_patch = (
+            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
+        )
+        max_dynamic_patch = (
+            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
+        )
+        dynamic_image_size = (
+            self.dynamic_image_size
+            if dynamic_image_size is None
+            else dynamic_image_size
+        )
+        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
+
+        return resolve_internvl_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+
+    def resolve_target_ratios(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+    ) -> list[tuple[int, int]]:
+        """Return the candidate tiling ratios for the resolved bounds."""
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+
+        return get_internvl_target_ratios(min_num, max_num)
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        """
+        Return the number of image tokens for an image of the given size:
+        the patch count from `calculate_internvl_targets` multiplied by
+        `self.num_image_token`.
+        """
+        target_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
+        )
+
+        num_patches, _, _ = calculate_internvl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=self.image_size,
+            target_ratios=target_ratios,
+            use_thumbnail=self.use_thumbnail,
+        )
+
+        return num_patches * self.num_image_token
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        """Convert each image into its stacked tile tensor, one per image."""
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
+        )
+
+        return [
+            image_to_pixel_values_internvl(
+                image,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=self.use_thumbnail,
+            )
+            for image in images
+        ]
+
+    def _preprocess_image(
+        self,
+        text: list[str],
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> tuple[list[str], dict[str, torch.Tensor]]:
+        """
+        Tile the images and expand the "<image>" placeholders in `text`.
+
+        Returns:
+            The updated text list and a dict that is empty when no images
+            were given, and otherwise holds "pixel_values_flat" (all tiles
+            concatenated) and "image_num_patches" (tile count per image).
+        """
+        if len(images) == 0:
+            image_inputs = {}
+        else:
+            pixel_values_lst = self._images_to_pixel_values_lst(
+                images,
+                min_dynamic_patch=min_dynamic_patch,
+                max_dynamic_patch=max_dynamic_patch,
+                dynamic_image_size=dynamic_image_size,
+            )
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst]
+                ),
+            }
+
+            for pixel_values in pixel_values_lst:
+                num_patches = pixel_values.shape[0]
+                feature_size = num_patches * self.num_image_token
+
+                image_repl = self.get_image_repl(feature_size, num_patches)
+                # Each image consumes the first remaining "<image>" in
+                # every text entry.
+                text = [t.replace("<image>", image_repl.full, 1) for t in text]
+        return text, image_inputs
+
+    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
+        """
+        Normalize an input to a list: `None` becomes an empty list and any
+        value that is not already a `list` is wrapped in one.
+        """
+        if input_item is None:
+            input_item = []
+        if not isinstance(input_item, list):
+            input_item = [input_item]
+        return input_item
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess images, expand placeholders, tokenize, and bundle the
+        tokenizer output with the image inputs into a `BatchFeature`.
+
+        Extra `**kwargs` are accepted but not used in this method.
+        """
+        text = self._make_batch_input(text)
+        images = self._make_batch_input(images)
+
+        text, image_inputs = self._preprocess_image(
+            text=text,
+            images=images,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        text_inputs = self.tokenizer(text)
+
+        # Image keys are merged after the tokenizer keys.
+        combined_outputs = {**text_inputs, **image_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
+
+
+class InternVLProcessor(BaseInternVLProcessor):
+    """
+    HF Processor for InternVLChatModel with extended video processing logic.
+
+    Video support is active only when `video_token` is set and present in
+    the tokenizer vocabulary (see `supports_video`).
+
+    Code for video processing is adapted from video example:
+    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        video_token: str | None = None,
+    ) -> None:
+        """Initialize the base processor and remember `video_token`."""
+        super().__init__(
+            config=config,
+            tokenizer=tokenizer,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+        # add extra video token for video processing
+        self.video_token = video_token
+
+    @property
+    def image_token_id(self) -> int:
+        """Vocabulary id of the `IMG_CONTEXT` token."""
+        return self.tokenizer.get_vocab()[IMG_CONTEXT]
+
+    @property
+    def video_token_id(self) -> int | None:
+        """
+        Vocabulary id of `video_token`, or `None` when no video token is
+        configured or the token is missing from the vocabulary.
+        """
+        if self.video_token is None:
+            return None
+        return self.tokenizer.get_vocab().get(self.video_token, None)
+
+    @property
+    def supports_video(self) -> bool:
+        """Whether a usable video token id could be resolved."""
+        return self.video_token_id is not None
+
+    def _videos_to_pixel_values_lst(
+        self,
+        videos: list[npt.NDArray],
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        """
+        Convert each video into a stacked per-frame tensor.
+
+        The patch bounds are fixed to 1 and thumbnails are disabled, so
+        each frame yields a single tile.
+        """
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=1,
+            max_dynamic_patch=1,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
+        )
+
+        return [
+            video_to_pixel_values_internvl(
+                video,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=False,
+            )
+            for video in videos
+        ]
+
+    def _preprocess_video(
+        self,
+        text: list[str],
+        videos: list[npt.NDArray],
+        dynamic_image_size: bool | None = None,
+    ) -> tuple[list[str], dict[str, Any]]:
+        """
+        Convert the videos and expand the "<video>" placeholders in `text`.
+
+        Returns the text unchanged with an empty dict when there are no
+        videos or video is unsupported. Otherwise the dict holds
+        "pixel_values_flat_video" and "video_num_patches".
+        """
+        if len(videos) == 0 or not self.supports_video:
+            return text, {}
+
+        video_token = self.video_token
+        assert video_token is not None
+
+        pixel_values_lst_video = self._videos_to_pixel_values_lst(
+            videos,
+            dynamic_image_size=dynamic_image_size,
+        )
+        video_inputs = {
+            "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
+            "video_num_patches": torch.tensor(
+                [len(item) for item in pixel_values_lst_video]
+            ),
+        }
+
+        for pixel_values in pixel_values_lst_video:
+            num_patches = pixel_values.shape[0]
+
+            video_repl = self.get_video_repl(
+                self.num_image_token, num_patches, video_token
+            )
+            # Each video consumes the first remaining "<video>" in every
+            # text entry.
+            text = [t.replace("<video>", video_repl.full, 1) for t in text]
+        return text, video_inputs
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        videos: npt.NDArray | list[npt.NDArray] | None = None,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess images and videos, expand their placeholders, tokenize
+        and bundle everything into a `BatchFeature`.
+
+        Images are processed before videos. Extra `**kwargs` are accepted
+        but not used in this method.
+        """
+        text = self._make_batch_input(text)
+        images = self._make_batch_input(images)
+        videos = self._make_batch_input(videos)
+
+        text, image_inputs = self._preprocess_image(
+            text=text,
+            images=images,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        text, video_inputs = self._preprocess_video(
+            text=text,
+            videos=videos,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        text_inputs = self.tokenizer(text)
+
+        # Later dicts win on key clashes: tokenizer, then image, then video.
+        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """
+        Build the replacement text for one image placeholder:
+        `feature_size` copies of `IMG_CONTEXT` wrapped in
+        `IMG_START`/`IMG_END`. `num_patches` is not used here.
+        """
+        repl_features = IMG_CONTEXT * feature_size
+        repl_full = IMG_START + repl_features + IMG_END
+
+        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
+
+    def get_video_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+        video_context_token: str = IMG_CONTEXT,
+    ) -> PromptUpdateDetails[str]:
+        """
+        Build the replacement text for one video placeholder.
+
+        One "Frame{i}: <img>...</img>" segment is emitted per frame.
+
+        NOTE(review): `feature_size` is not read in this body; the number
+        of context tokens per frame comes from `self.num_image_token`.
+        Confirm this is intended before relying on the argument.
+
+        Raises:
+            NotImplementedError: If `num_patches` is `None`.
+        """
+        if num_patches is None:
+            raise NotImplementedError("Embedding inputs are not supported")
+
+        repl_features = video_context_token * self.num_image_token
+        repl_features_with_sep = IMG_START + repl_features + IMG_END
+        # num_patches is equal to num_frames
+        repl_full = "".join(
+            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
+        )
+
+        return PromptUpdateDetails.select_text(repl_full, video_context_token)
--- a/vllm/transformers_utils/processors/nano_nemotron_vl.py
+++ b/vllm/transformers_utils/processors/nano_nemotron_vl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# --------------------------------------------------------
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/internvl.py
+# under Apache-2.0 License
+#     LICENSE is in root directory.
+# --------------------------------------------------------
+
+import math
+from abc import ABC, abstractmethod
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any, TypeVar
+
+import einops
+import numpy as np
+import numpy.typing as npt
+import regex as re
+import torch
+from PIL import Image
+from transformers import BatchFeature, PretrainedConfig, TensorType
+
+from vllm.model_executor.models.parakeet import ParakeetExtractor
+from vllm.multimodal.evs import compute_retained_tokens_count
+from vllm.multimodal.inputs import AudioItem
+from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
+from vllm.tokenizers import TokenizerLike
+
+from .internvl import calculate_internvl_targets, get_internvl_target_ratios
+
+# Generic type variable for helpers in this module.
+_T = TypeVar("_T")
+
+
+# Marker strings for image content. Here the context marker is "<image>".
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<image>"
+# Marker strings for audio content.
+# NOTE(review): presumably start/end delimiters plus a per-embedding
+# placeholder; their use is not visible in this section - confirm.
+AUDIO_START = "<so_start>"
+AUDIO_END = "<so_end>"
+AUDIO_CONTEXT = "<so_embedding>"
+
+# Profiling
+# MAX_FRAMES = 16
+# Default tile count; matches the `max_num_tiles=12` default of
+# `dynamic_preprocess`.
+DEFAULT_NUM_TILES = 12
+
+
+def calculate_timestamps(
+    indices: list[int] | torch.Tensor,
+    frame_duration_ms: int,
+):
+    """
+    Convert frame indices into timestamps in seconds.
+
+    Args:
+        indices: Frame indices as a list, or any object exposing
+            `.tolist()` (e.g. a tensor).
+        frame_duration_ms: Duration of one frame in milliseconds.
+
+    Returns:
+        A list with `int(index) * frame_duration_ms / 1000.0` per index.
+    """
+    frame_ids = indices if isinstance(indices, list) else indices.tolist()
+    return [int(frame_id) * frame_duration_ms / 1000.0 for frame_id in frame_ids]
+
+
+def input_conditioner(
+    x: torch.Tensor,
+    norm_mean: torch.Tensor,
+    norm_std: torch.Tensor,
+):
+    """Normalize `x` by subtracting `norm_mean` and dividing by `norm_std`."""
+    centered = x - norm_mean
+    return centered / norm_std
+
+
+def dynamic_preprocess(
+    image,
+    *,
+    image_size=512,
+    max_num_tiles=12,
+    use_thumbnail=True,
+    idx=0,
+):
+    """
+    Resize an image to its best tiling grid and split it into tensor tiles.
+
+    Args:
+        image: Image exposing `.size`, `.mode` and `.convert` (PIL-style).
+        image_size: Side length of every tile.
+        max_num_tiles: Upper bound for the candidate tiling ratios.
+        use_thumbnail: When true and more than one tile was produced, the
+            whole image resized to one tile is appended last.
+        idx: Not used in this function.
+
+    Returns:
+        A list of tiles, each `(C, image_size, image_size)`, with values
+        divided by 255.
+    """
+    orig_width, orig_height = image.size
+
+    candidate_ratios = get_internvl_target_ratios(1, max_num_tiles)
+
+    # Only the resize targets are needed; the block count is ignored.
+    _, target_width, target_height = calculate_internvl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=candidate_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
+    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
+    pixels = np.asarray(rgb_image, dtype=np.uint8)
+
+    # (H, W, 3) -> (1, H, W, 3) -> (1, 3, H, W)
+    batch = torch.from_numpy(pixels).unsqueeze(0).permute(0, 3, 1, 2)
+
+    resize_options = dict(mode="bicubic", align_corners=False, antialias=True)
+
+    resized = torch.nn.functional.interpolate(
+        batch,
+        size=(target_height, target_width),
+        **resize_options,
+    )
+    B, C, H, W = resized.shape
+    rows, cols = H // image_size, W // image_size
+
+    # Split into a rows x cols grid and flatten the grid into the batch dim.
+    grid = resized.reshape(B, C, rows, image_size, cols, image_size)
+    grid = grid.permute(0, 2, 4, 1, 3, 5)
+    tiles = grid.reshape(B * rows * cols, C, image_size, image_size) / 255.0
+
+    if use_thumbnail and tiles.shape[0] > 1:
+        thumbnail = (
+            torch.nn.functional.interpolate(
+                batch,
+                size=(image_size, image_size),
+                **resize_options,
+            )
+            / 255.0
+        )
+        tiles = torch.cat([tiles, thumbnail], dim=0)
+
+    return list(tiles)
+
+
+def image_to_pixel_values(
+    image: Image.Image,
+    *,
+    input_size: int,
+    max_num: int,
+    use_thumbnail: bool,
+    idx: int,
+) -> torch.Tensor:
+    """Tile one image with `dynamic_preprocess` and stack the tiles.
+
+    Returns:
+        A tensor whose leading dimension indexes the tiles.
+    """
+    return torch.stack(
+        dynamic_preprocess(
+            image,
+            image_size=input_size,
+            max_num_tiles=max_num,
+            use_thumbnail=use_thumbnail,
+            idx=idx,
+        )
+    )
+
+
+def video_to_pixel_values(
+    video: npt.NDArray,
+    *,
+    input_size: int,
+    max_num_tiles: int = 1,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    """Convert a `(num_frames, H, W, C)` frame array into scaled frame tensors.
+
+    Frames are resized with bicubic interpolation only when their spatial size
+    is not already `input_size x input_size`. `use_thumbnail` is accepted but
+    not read here.
+
+    Returns:
+        A `(num_frames, C, input_size, input_size)` tensor divided by 255.
+    """
+    assert max_num_tiles == 1, "Video modality always uses one tile"
+
+    # Channels-last frames -> channels-first frames.
+    frames = torch.from_numpy(video).permute(0, 3, 1, 2)
+
+    already_sized = frames.shape[2] == input_size and frames.shape[3] == input_size
+    if not already_sized:
+        frames = torch.nn.functional.interpolate(
+            frames,
+            size=(input_size, input_size),
+            mode="bicubic",
+            align_corners=False,
+            antialias=True,
+        )
+
+    return frames / 255.0
+
+
+class DynamicResolutionImageTiler:
+    """Resize each image to one tile whose patch grid fits a token budget.
+
+    The budget is what remains of `max_model_len` after the text prompt; it is
+    shared between all images of a prompt (see `compute_params`).
+    """
+
+    CONV_MERGING = False
+    PIXEL_SHUFFLE = True
+    USE_THUMBNAIL = False
+
+    def __init__(
+        self,
+        *,
+        max_model_len: int,
+        patch_size: int,
+        min_num_patches: int,
+        max_num_patches: int,
+        downsample_ratio: float,
+        norm_mean: Sequence[float],
+        norm_std: Sequence[float],
+        factor_max: float = 1.0,
+        use_thumbnail: bool = False,
+    ) -> None:
+        """Store the tiling configuration.
+
+        Args:
+            max_model_len: Context length used to derive the token budget.
+            patch_size: Side length of one vision patch in pixels.
+            min_num_patches: Lower bound on the patches granted to one image.
+            max_num_patches: Upper bound on patches per image; a value <= 0
+                disables the bound.
+            downsample_ratio: Must be 0.5 (asserted below).
+            norm_mean: Per-channel mean, stored as a `(3, 1, 1)` tensor.
+            norm_std: Per-channel std, stored as a `(3, 1, 1)` tensor.
+            factor_max: Cap on the resize factor applied to the patch grid.
+            use_thumbnail: Must be False; thumbnails are not supported.
+        """
+        assert use_thumbnail is False, "use_thumbnail is not supported"
+        self._patch_size: int = patch_size
+        self._max_model_len = max_model_len
+        self._min_num_patches = min_num_patches
+        self._max_num_patches = max_num_patches if max_num_patches > 0 else float("inf")
+        self._factor_max = factor_max
+        self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1)
+        self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1)
+        assert downsample_ratio < 1
+        reduction_factor = 1 / downsample_ratio
+        assert reduction_factor == 2.0
+        # One 2x reduction per enabled merging stage (bools add up as 0/1).
+        self._downsample_ratio = int(reduction_factor) ** (
+            self.PIXEL_SHUFFLE + self.CONV_MERGING
+        )
+        assert self._downsample_ratio == 2
+
+    def _get_num_embeddings(self, width: int, height: int) -> int:
+        """Return the number of embeddings for a `width x height` pixel image."""
+        num_patches = (width // self._patch_size) * (height // self._patch_size)
+        num_tokens = num_patches // (self._downsample_ratio**2)
+        return num_tokens
+
+    def width_and_height_for_max_num_tokens_available(
+        self,
+        target_num_tokens_post_shuffle: int,
+    ) -> tuple[int, int]:
+        """
+        TODO: optimize this so it squeezes closer to target number of tokens.
+        Calculate image dimensions that produce approximately `target` tokens after
+        pixel_shuffle.
+
+        With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
+        need 4*B patches to get B tokens.
+
+        Examples:
+        >>> PATCH_SIZE = 16
+        >>> DOWNSAMPLE_RATIO = 0.5
+        >>> tiler = DynamicResolutionImageTiler(
+        ...     max_model_len=16384,
+        ...     patch_size=PATCH_SIZE,
+        ...     downsample_ratio=DOWNSAMPLE_RATIO,
+        ...     min_num_patches=4,
+        ...     max_num_patches=0,
+        ...     norm_mean=(0.5, 0.5, 0.5),
+        ...     norm_std=(0.5, 0.5, 0.5),
+        ... )
+        >>> width, height = tiler.width_and_height_for_max_num_tokens_available(
+        ...     target_num_tokens_post_shuffle=8192,
+        ... )
+        >>> assert (width, height) == (2880, 2880)
+        >>> assert (width // PATCH_SIZE) * (
+        ...     height // PATCH_SIZE
+        ... ) // 2**2 == 8100  # tokens post-shuffle
+        >>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
+        """
+        side_pixels = (
+            math.isqrt(target_num_tokens_post_shuffle)
+            * self._downsample_ratio
+            * self._patch_size
+        )
+        assert isinstance(side_pixels, int) and side_pixels % self._patch_size == 0
+        return side_pixels, side_pixels
+
+    def max_num_tokens_available(self, text_prompt_length: int) -> int:
+        """Return the tokens left for images after the text prompt.
+
+        NOTE(review): the constant 4 is presumably a margin for wrapper
+        tokens - confirm.
+        """
+        return self._max_model_len - text_prompt_length - 4
+
+    def _images_to_pixel_values_lst(
+        self,
+        text_prompt_length: int,
+        images: list[Image.Image],
+    ) -> tuple[list[torch.Tensor], list[int]]:
+        """Resize all images under the shared budget.
+
+        Returns:
+            The resized `(C, H, W)` tensors and, aligned with them, the number
+            of embeddings each one produces.
+        """
+        num_tokens_available = self.max_num_tokens_available(text_prompt_length)
+        params_per_image = self.compute_params(images, num_tokens_available)
+
+        feature_sizes: list[int] = []
+        pixel_values: list[torch.Tensor] = []
+        for param in params_per_image:
+            for t in self.apply_params(param):
+                assert t.ndim == 3, f"{t.ndim=}: expected 3 dim tensor"
+                pixel_values.append(t)
+                feature_sizes.append(param.num_embeddings)
+        return pixel_values, feature_sizes
+
+    # Maps `id(image)` to the embedding count computed by `compute_params`.
+    # Shared by all instances because it is a class attribute.
+    feature_size_cache: dict[int, int] = {}
+
+    @classmethod
+    def get_cached_feature_size(cls, image: Image.Image) -> int:
+        """Return and remove the cached embedding count of `image`.
+
+        Raises:
+            KeyError: If no entry exists for `id(image)`, e.g. on a second
+                lookup of the same image.
+        """
+        # pop() enforces that each cached feature size is consumed only once.
+        return cls.feature_size_cache.pop(id(image))
+
+    @dataclass
+    class DynamicResolutionParams:
+        # Image these parameters were computed for.
+        media: Image.Image
+        # Number of tiles produced for the image (always 1 here).
+        num_tiles: int
+        # Number of embeddings the resized image produces.
+        num_embeddings: int
+        # Target patch grid as (width, height), counted in patches.
+        patch_size: tuple[int, int]
+
+    def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
+        """Resize `params.media` to its target grid and scale pixels by 1/255.
+
+        Returns:
+            A one-element list holding the resized `(3, H, W)` tensor.
+        """
+        # `patch_size` is (width, height); interpolate expects (height, width).
+        target_size = (
+            params.patch_size[1] * self._patch_size,
+            params.patch_size[0] * self._patch_size,
+        )
+        image = np.asarray(
+            params.media.convert("RGB") if params.media.mode != "RGB" else params.media,
+            dtype=np.uint8,
+        )
+        resized_img = (
+            torch.nn.functional.interpolate(
+                torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
+                size=target_size,
+                mode="bicubic",
+                align_corners=False,
+                antialias=True,
+            )
+            / 255.0
+        )
+        return list(resized_img)
+
+    def process_media(
+        self,
+        media: Image.Image,
+        num_tokens_available: int,
+    ) -> tuple[DynamicResolutionParams, int]:
+        """Process a single media item and return its parameters.
+
+        Args:
+            media: The media item to process
+            num_tokens_available: Number of tokens available for this media
+        Returns:
+            A tuple of the DynamicResolutionParams for the media and the
+            number of patches (width * height of the target grid) it uses.
+        """
+        current_num_tokens_available = num_tokens_available
+        assert isinstance(media, Image.Image), (
+            "Dynamic resolution is only supported for image media"
+        )
+        orig_width, orig_height = media.width, media.height
+        # NOTE: Python's round() rounds halves to the nearest even integer, so
+        # sizes that are exact multiples of the patch size alternate between
+        # rounding down and up. Kept as-is to preserve existing token counts.
+        closest_patch_height = round(orig_height / self._patch_size + 0.5)
+        closest_patch_width = round(orig_width / self._patch_size + 0.5)
+        patches = closest_patch_height * closest_patch_width
+
+        factor = min(
+            math.sqrt(current_num_tokens_available / patches), self._factor_max
+        )
+        # floor() can collapse the short side of a very elongated image to 0,
+        # which would divide by zero below and yield an empty resize target.
+        # Keep at least one patch per side.
+        target_patch_height = max(1, math.floor(factor * closest_patch_height))
+        target_patch_width = max(1, math.floor(factor * closest_patch_width))
+
+        # Grow the grid up to self._min_num_patches, but only when the budget
+        # itself is larger than that minimum.
+        if (
+            current_num_tokens_available > self._min_num_patches
+            and target_patch_height * target_patch_width < self._min_num_patches
+        ):
+            up_factor = math.sqrt(
+                self._min_num_patches / (target_patch_height * target_patch_width)
+            )
+            target_patch_height = math.ceil(up_factor * target_patch_height)
+            target_patch_width = math.ceil(up_factor * target_patch_width)
+
+        # Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
+        # or by 4 when BOTH are enabled (two successive 2x reductions)
+        if self.PIXEL_SHUFFLE or self.CONV_MERGING:
+            required_divisor = 4 if (self.PIXEL_SHUFFLE and self.CONV_MERGING) else 2
+
+            rem_h = target_patch_height % required_divisor
+            if rem_h != 0:
+                inc_h = required_divisor - rem_h
+                # Round up when the budget allows it, otherwise round down
+                # (never below one full divisor).
+                if (
+                    target_patch_height + inc_h
+                ) * target_patch_width <= current_num_tokens_available:
+                    target_patch_height += inc_h
+                else:
+                    target_patch_height = max(
+                        required_divisor, target_patch_height - rem_h
+                    )
+
+            rem_w = target_patch_width % required_divisor
+            if rem_w != 0:
+                inc_w = required_divisor - rem_w
+                if (
+                    target_patch_height * (target_patch_width + inc_w)
+                    <= current_num_tokens_available
+                ):
+                    target_patch_width += inc_w
+                else:
+                    target_patch_width = max(
+                        required_divisor, target_patch_width - rem_w
+                    )
+
+        # Calculate embeddings for the main dynamic resolution image
+        num_embeddings = self._get_num_embeddings(
+            target_patch_width * self._patch_size,
+            target_patch_height * self._patch_size,
+        )
+
+        token_count = target_patch_width * target_patch_height
+
+        # Thumbnails are not supported, so every image is exactly one tile.
+        num_tiles = 1
+
+        return self.DynamicResolutionParams(
+            media=media,
+            num_tiles=num_tiles,
+            num_embeddings=num_embeddings,
+            patch_size=(target_patch_width, target_patch_height),
+        ), token_count
+
+    def compute_params(
+        self,
+        media_list: list[Image.Image],
+        num_tokens_available: int,
+    ) -> list[DynamicResolutionParams]:
+        """Compute parameters for all media with iterative token budgeting.
+
+        As a side effect, the embedding count of every media item is written to
+        `feature_size_cache` under `id(media)`.
+
+        Args:
+            media_list: List of media items to process
+            num_tokens_available: Total number of tokens available across all media
+        Returns:
+            List of DynamicResolutionParams, one per media item
+        Raises:
+            ValueError: If no assignment within budget is found after 10 rounds.
+        """
+        # Convert the token budget into a patch budget: every enabled merging
+        # stage folds 4 patches into 1 token.
+        num_tokens_available = (
+            num_tokens_available
+            * (4 if self.PIXEL_SHUFFLE else 1)
+            * (4 if self.CONV_MERGING else 1)
+        )
+        # When the number of available token is too small,
+        # allow self._min_num_patches per media and let the sample be truncated.
+        num_tokens_available = max(
+            num_tokens_available, self._min_num_patches * len(media_list)
+        )
+
+        # Clip the number of tokens available per media to >min and <max patches.
+        num_tokens_available_per_media = [
+            int(
+                max(
+                    min(num_tokens_available, self._max_num_patches),
+                    self._min_num_patches,
+                )
+            )
+            for _ in range(len(media_list))
+        ]
+
+        # prevent infinite loop in any case
+        for _ in range(10):
+            # Step 1: Process each media with current token budget
+            params = []
+            token_counts = []
+
+            for media, tokens_for_media in zip(
+                media_list, num_tokens_available_per_media
+            ):
+                param, token_count = self.process_media(media, tokens_for_media)
+                params.append(param)
+                token_counts.append(token_count)
+                self.feature_size_cache[id(param.media)] = param.num_embeddings
+
+            # Step 2: Check if total tokens is within budget
+            total_tokens = sum(token_counts)
+
+            if total_tokens <= num_tokens_available:
+                # We're within budget, return the params
+                return params
+
+            # Step 3: We're over budget, need to scale down
+            # Calculate scaling factor to get under budget
+            scaling_factor = num_tokens_available / total_tokens
+
+            # Recalculate token budgets for each media based on scaling
+            # Each media gets a proportional share of the total budget
+            scaled_down_num_tokens_available_per_media = [
+                max(self._min_num_patches, int(token_count * scaling_factor))
+                for token_count in token_counts
+            ]
+            scaled_down = any(
+                scaled < current
+                for scaled, current in zip(
+                    scaled_down_num_tokens_available_per_media,
+                    num_tokens_available_per_media,
+                )
+            )
+            # If there wasn't scaling down, we're stuck with min_num_patches per media,
+            # else try with the scaled down num_tokens_available_per_media.
+            if not scaled_down:
+                num_tokens_available_per_media = [self._min_num_patches] * len(
+                    media_list
+                )
+            else:
+                num_tokens_available_per_media = (
+                    scaled_down_num_tokens_available_per_media
+                )
+        ctx = f"{params=} {total_tokens=} {num_tokens_available=}"
+        raise ValueError(
+            f"Should be unreachable - `return params` above must be reached: {ctx}"
+        )
+
+    @staticmethod
+    def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
+        """Flatten images into one sequence of patches.
+
+        Each `(C, H, W)` image becomes `(num_patches, C * patch_size**2)`; the
+        sequences are concatenated and given a leading batch dimension of 1.
+        """
+        assert len(images) > 0, "No images to stack"
+
+        def rearrange_img(x):
+            py = x.shape[-2] // patch_size
+            px = x.shape[-1] // patch_size
+            x = einops.rearrange(
+                x,
+                "c (py yy) (px xx) -> (py px) (c yy xx)",
+                py=py,
+                yy=patch_size,
+                px=px,
+                xx=patch_size,
+            )
+            return x
+
+        imgs = [rearrange_img(img) for img in images]
+        pixel_values_flat = torch.cat(imgs, dim=0).unsqueeze(0)
+        return pixel_values_flat
+
+
+class BaseNanoNemotronVLProcessor(ABC):
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+
+    The code to insert image tokens is based on:
+    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *args,
+        max_model_len: int,
+        max_num_tiles: int | None = None,
+        **kwargs,
+    ) -> None:
+        """Read the image settings from `config` and set up optional tiling.
+
+        Args:
+            config: Model config providing `force_image_size`, `patch_size`,
+                `downsample_ratio`, `use_thumbnail`, `norm_mean`, `norm_std`
+                and `vision_config.args`.
+            tokenizer: Tokenizer used to measure and tokenize prompts.
+            max_model_len: Context length forwarded to the dynamic tiler.
+            max_num_tiles: Tile budget per image; falls back to
+                `DEFAULT_NUM_TILES` when None (or 0).
+        """
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
+        image_size: int = config.force_image_size
+        patch_size: int = config.patch_size
+        downsample_ratio: float = config.downsample_ratio
+
+        # Tokens produced by one tile after downsampling.
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (downsample_ratio**2)
+        )
+        self.image_size = image_size
+        self.use_thumbnail: bool = config.use_thumbnail
+        self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
+        self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)
+
+        self.dynamic_tiler: DynamicResolutionImageTiler | None = None
+        if self.use_dynamic_resolution(config):
+            self.dynamic_tiler = DynamicResolutionImageTiler(
+                max_model_len=max_model_len,
+                patch_size=patch_size,
+                downsample_ratio=downsample_ratio,
+                min_num_patches=config.vision_config.args["min_num_patches"],
+                max_num_patches=config.vision_config.args["max_num_patches"],
+                norm_mean=config.norm_mean,
+                norm_std=config.norm_std,
+            )
+
+    @staticmethod
+    def use_dynamic_resolution(config: PretrainedConfig) -> bool:
+        """Return True when the vision config defines `min_num_patches`."""
+        return "min_num_patches" in config.vision_config.args
+
+    @property
+    @abstractmethod
+    def image_token_id(self) -> int:
+        """Token id of the image placeholder token."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """Build the prompt text replacing one `<image>` placeholder."""
+        raise NotImplementedError
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        max_num_tiles: int,
+    ) -> int:
+        """Return the token count of a tiled image of the given size."""
+        target_ratios = get_internvl_target_ratios(1, max_num_tiles)
+
+        num_patches, _, _ = calculate_internvl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            target_ratios=target_ratios,
+            image_size=self.image_size,
+            use_thumbnail=self.use_thumbnail,
+        )
+
+        return num_patches * self.num_image_token
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        max_num_tiles: int,
+    ) -> list[torch.Tensor]:
+        """Tile every image; returns one stacked tile tensor per image."""
+        return [
+            image_to_pixel_values(
+                image,
+                input_size=self.image_size,
+                max_num=max_num_tiles,
+                use_thumbnail=self.use_thumbnail,
+                idx=idx,
+            )
+            for idx, image in enumerate(images)
+        ]
+
+    def _preprocess_image(
+        self,
+        text: list[str],
+        images: list[Image.Image],
+        max_num_tiles: int,
+    ) -> tuple[list[str], dict[str, Any]]:
+        """Expand `<image>` placeholders and build the image model inputs.
+
+        Args:
+            text: Single-element list holding the prompt.
+            images: Images referenced by the prompt, in order.
+            max_num_tiles: Tile budget per image (fixed-tile path only).
+
+        Returns:
+            The prompt with every `<image>` placeholder replaced by its full
+            replacement text, and the dict of image inputs. When `images` is
+            empty the text is returned unchanged with an empty dict.
+        """
+        if len(images) == 0:
+            return text, {}
+
+        image_inputs: dict[str, Any]
+        if tiler := self.dynamic_tiler:
+            # The text-only prompt length determines the remaining image budget.
+            sans_images = text[0].replace("<image>", "")
+            text_prompt_length = len(
+                self.tokenizer(sans_images, add_special_tokens=False).input_ids
+            )
+            pixel_values_lst, num_tokens_per_image = tiler._images_to_pixel_values_lst(
+                text_prompt_length=text_prompt_length,
+                images=images,
+            )
+            imgs_sizes = [(pv.shape[-2], pv.shape[-1]) for pv in pixel_values_lst]
+            normalized = [
+                input_conditioner(img, tiler.norm_mean, tiler.norm_std)
+                for img in pixel_values_lst
+            ]
+            image_num_patches = torch.tensor([1] * len(num_tokens_per_image))
+            image_inputs = {
+                "pixel_values_flat": normalized,
+                "imgs_sizes": imgs_sizes,
+                "num_tokens_per_image": num_tokens_per_image,
+            }
+        else:
+            pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
+            image_num_patches = torch.tensor([len(item) for item in pixel_values_lst])
+            pixel_values_flat = input_conditioner(
+                torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
+            )
+            image_inputs = {
+                "pixel_values_flat": pixel_values_flat,
+                "image_num_patches": image_num_patches,
+            }
+            num_tokens_per_image = [
+                self.num_image_token * len(item) for item in pixel_values_lst
+            ]
+
+        assert len(text) == 1, (
+            "hf_processor is called on the output of get_dummy_text, "
+            "which should be a single string"
+        )
+        parts = [x for x in re.split(r"(<image>)", text[0]) if x]
+        # `parts` interleaves plain text with placeholders, so the n-th image
+        # does not necessarily live at index n. Locate the placeholders first.
+        image_slots = [idx for idx, part in enumerate(parts) if part == "<image>"]
+        assert len(image_slots) == len(pixel_values_lst), (
+            "the number of <image> tokens in the text should be the "
+            "same as the number of images"
+        )
+
+        for slot, feature_size, num_patches in zip(
+            image_slots, num_tokens_per_image, image_num_patches, strict=True
+        ):
+            image_repl = self.get_image_repl(feature_size, num_patches)
+            parts[slot] = image_repl.full
+        text = ["".join(parts)]
+
+        return text, image_inputs
+
+    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
+        """Wrap a single item in a list; None becomes an empty list."""
+        if input_item is None:
+            input_item = []
+        if not isinstance(input_item, list):
+            input_item = [input_item]
+        return input_item
+
+    @abstractmethod
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        *,
+        return_tensors: str | TensorType | None = None,
+        max_num_tiles: int | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Process a prompt and its media into model inputs."""
+        raise NotImplementedError
+
+class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
+    """
+    HF Processor with extended video processing logic.
+    Code for video processing is adapted from video example:
+    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *,
+        max_model_len: int,
+        max_num_tiles: int | None = None,
+        video_token: str | None = None,
+        video_pruning_rate: float | None = None,
+    ) -> None:
+        """Set up image handling via the base class, then video and audio.
+
+        Args:
+            config: Model config; an optional `sound_config` attribute enables
+                the audio extractor.
+            tokenizer: Tokenizer used for prompts and special tokens.
+            max_model_len: Forwarded to the base class.
+            max_num_tiles: Forwarded to the base class.
+            video_token: Extra token marking videos, or None.
+            video_pruning_rate: Pruning rate used when building video
+                placeholders, or None.
+        """
+        super().__init__(
+            config=config,
+            tokenizer=tokenizer,
+            max_model_len=max_model_len,
+            max_num_tiles=max_num_tiles,
+        )
+        # Extra video token and pruning rate used for video processing.
+        self.video_token = video_token
+        self.video_pruning_rate = video_pruning_rate
+
+        sound_config = getattr(config, "sound_config", None)
+        self.audio_extractor: ParakeetExtractor | None = (
+            None if sound_config is None else ParakeetExtractor(sound_config)
+        )
+
+        # Tokenize the image markers once so video processing can reuse them.
+        def encode_marker(marker: str):
+            return tokenizer.encode(marker, add_special_tokens=False)
+
+        self._img_start_token_ids = encode_marker(IMG_START)
+        self._img_end_token_ids = encode_marker(IMG_END)
+        self._img_context_token_ids = encode_marker(IMG_CONTEXT)
+
+    @property
+    def supports_video(self) -> bool:
+        """Whether a video token id could be resolved."""
+        token_id = self.video_token_id
+        return token_id is not None
+
+    @property
+    def video_token_id(self) -> int | None:
+        """Vocabulary id of `video_token`, or None when unset or unknown."""
+        token = self.video_token
+        if token is not None:
+            return self.tokenizer.get_vocab().get(token)
+        return None
+
+    @property
+    def image_token_id(self) -> int:
+        """Id the tokenizer assigns to the `IMG_CONTEXT` placeholder."""
+        tokenizer = self.tokenizer
+        return tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
+
+    def _videos_to_pixel_values_lst(
+        self,
+        videos: list[npt.NDArray],
+        max_num_tiles: int,
+    ) -> list[torch.Tensor]:
+        """Convert every video to frame tensors via `video_to_pixel_values`."""
+        frame_batches: list[torch.Tensor] = []
+        for video in videos:
+            frame_batches.append(
+                video_to_pixel_values(
+                    video,
+                    input_size=self.image_size,
+                    max_num_tiles=max_num_tiles,
+                    use_thumbnail=self.use_thumbnail,
+                )
+            )
+        return frame_batches
+
+    def _preprocess_video(
+        self,
+        text: list[str],
+        videos: list[tuple[npt.NDArray, dict[str, Any]]],
+        max_num_tiles: int,
+    ) -> tuple[list[str], dict[str, Any]]:
+        """Expand `<video>` placeholders and build the video model inputs.
+
+        Args:
+            text: Prompt strings; the first remaining `<video>` in each string
+                is replaced once per video.
+            videos: `(frames, metadata)` pairs; the metadata must provide
+                `fps` and `frames_indices`.
+            max_num_tiles: Forwarded to `_videos_to_pixel_values_lst`.
+
+        Returns:
+            The updated text and the dict of video inputs. When there are no
+            videos, or video is unsupported, the text is returned unchanged
+            with an empty dict.
+        """
+        if len(videos) == 0 or not self.supports_video:
+            return text, {}
+
+        videos_lst = [v[0] for v in videos]
+        video_metadata_lst = [v[1] for v in videos]
+        pixel_values_lst_video = self._videos_to_pixel_values_lst(
+            videos_lst,
+            max_num_tiles=max_num_tiles,
+        )
+
+        # We use the frame duration in milliseconds (as an integer) to keep
+        # the timestamp calculation consistent. At preprocessing the fps
+        # parameter is given in fp32, while at inference it is bf16, which
+        # leads to inaccurate timestamp calculation and causes timestamp
+        # values to differ. In rare cases this causes a mismatching number
+        # of output tokens for tokenized frame prefixes.
+        frame_duration_ms_lst = [
+            int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
+        ]
+        frames_indices_lst = [
+            metadata["frames_indices"] for metadata in video_metadata_lst
+        ]
+        video_num_patches = torch.tensor([len(item) for item in pixel_values_lst_video])
+        video_inputs = {
+            "pixel_values_flat_video": input_conditioner(
+                torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
+            ),
+            "video_num_patches": video_num_patches,
+            "frames_indices": frames_indices_lst,
+            "frame_duration_ms": torch.tensor(frame_duration_ms_lst),
+        }
+
+        image_size: int = self.config.force_image_size
+        patch_size: int = self.config.patch_size
+        downsample_ratio = self.config.downsample_ratio
+        # Tokens produced by one full frame after downsampling.
+        tokens_in_single_frame = int(
+            (image_size * image_size // patch_size**2) * (downsample_ratio**2)
+        )
+
+        # NOTE(review): `video_metadata` is unpacked below but not read in the loop.
+        for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
+            pixel_values_lst_video,
+            video_metadata_lst,
+            frames_indices_lst,
+            frame_duration_ms_lst,
+        ):
+            num_frames = pixel_values.shape[0]
+
+            if self.video_pruning_rate is not None and self.video_pruning_rate > 0.0:
+                # Start of EVS-specific code
+                num_tokens = compute_retained_tokens_count(
+                    tokens_per_frame=tokens_in_single_frame,
+                    num_frames=num_frames,
+                    q=self.video_pruning_rate,
+                )
+
+                # Here we only need placeholders that won't actually be
+                # replaced; only the total number of tokens has to be correct,
+                # so all tokens are assigned to the first frame.
+                tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
+
+                # End of EVS-specific code
+            else:
+                tokens_per_frame = [tokens_in_single_frame] * num_frames
+
+            video_repl = self.get_video_repl(
+                tokens_per_frame=tokens_per_frame,
+                frames_indices=frames_indices,
+                frame_duration_ms=frame_duration_ms,
+                tokenizer=self.tokenizer,
+                img_start_token_ids=self._img_start_token_ids,
+                img_end_token_ids=self._img_end_token_ids,
+                img_context_token_ids=self._img_context_token_ids,
+            )
+
+            # video_repl.full is a list of token IDs.
+            # Convert the token IDs back to text for the HF processor flow.
+            video_repl_text = self.tokenizer.decode(
+                video_repl.full, skip_special_tokens=False
+            )
+            # Replace only the first remaining placeholder so that successive
+            # videos fill successive placeholders.
+            text = [t.replace("<video>", video_repl_text, 1) for t in text]
+
+        return text, video_inputs
+
+    def _preprocess_audio(
+        self,
+        text: list[str],
+        audios: list[npt.NDArray],
+    ) -> tuple[list[str], dict[str, Any]]:
+        """Expand audio placeholders and extract the audio model inputs.
+
+        Args:
+            text: Prompt strings; only the first one is read.
+            audios: Audio clips, one per `AUDIO_CONTEXT` placeholder.
+
+        Returns:
+            The prompt with each placeholder replaced by its full replacement
+            text, and a dict with the extracted features, their attention mask
+            and the per-clip feature lengths. When `audios` is empty the text
+            is returned unchanged with an empty dict.
+
+        Raises:
+            ValueError: If the placeholder count differs from `len(audios)`.
+        """
+        if len(audios) == 0:
+            return text, {}
+
+        extractor = self.audio_extractor
+        assert extractor is not None
+
+        pieces = [
+            piece
+            for piece in re.split(f"({re.escape(AUDIO_CONTEXT)})", text[0])
+            if piece
+        ]
+        placeholder_count = pieces.count(AUDIO_CONTEXT)
+        if placeholder_count != len(audios):
+            raise ValueError(
+                "Number of audio tokens in text does not match the number "
+                f"of audios (tokens={placeholder_count}, audios={len(audios)})."
+            )
+
+        # Consume the clips in order, one per placeholder piece.
+        remaining_audios = iter(audios)
+        expanded = [
+            self.get_audio_repl(next(remaining_audios)).full
+            if piece == AUDIO_CONTEXT
+            else piece
+            for piece in pieces
+        ]
+        text = ["".join(expanded)]
+
+        features = extractor(
+            audios,
+            sampling_rate=extractor.sampling_rate,
+            return_tensors="pt",
+        )
+        attention_mask = features.attention_mask
+        return text, {
+            "input_audio_features": features.input_features,
+            "feature_attention_mask": attention_mask,
+            "audio_feature_lengths": attention_mask.sum(dim=1),
+        }
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        videos: tuple[npt.NDArray, dict[str, Any]]
+        | list[tuple[npt.NDArray, dict[str, Any]]]
+        | None = None,
+        audios: AudioItem | list[AudioItem] | None = None,
+        *,
+        return_tensors: str | TensorType | None = None,
+        max_num_tiles: int | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Expand media placeholders, tokenize, and bundle all model inputs.
+
+        Images, videos and audios are processed in that order, each step
+        rewriting the prompt. Single items are wrapped into lists first.
+
+        Args:
+            text: Prompt or list of prompts.
+            images: Image or list of images.
+            videos: `(frames, metadata)` pair or list of such pairs.
+            audios: Audio clip or list of clips.
+            return_tensors: Tensor type handed to `BatchFeature`.
+            max_num_tiles: Tile budget per image; defaults to
+                `self.max_num_tiles` when None.
+
+        Returns:
+            A `BatchFeature` with the tokenized text and all media inputs.
+        """
+        tiles_per_image = self.max_num_tiles if max_num_tiles is None else max_num_tiles
+
+        as_batch = self._make_batch_input
+        prompt, image_items = as_batch(text), as_batch(images)
+        video_items, audio_items = as_batch(videos), as_batch(audios)
+
+        prompt, image_inputs = self._preprocess_image(
+            text=prompt,
+            images=image_items,
+            max_num_tiles=tiles_per_image,
+        )
+        prompt, video_inputs = self._preprocess_video(
+            text=prompt,
+            videos=video_items,
+            max_num_tiles=1,
+        )
+        prompt, audio_inputs = self._preprocess_audio(
+            text=prompt,
+            audios=audio_items,
+        )
+
+        text_inputs = self.tokenizer(prompt, add_special_tokens=False)
+        non_image_inputs = {**text_inputs, **video_inputs, **audio_inputs}
+
+        if self.dynamic_tiler is not None:
+            # Image inputs are added after construction so they are exempt
+            # from the BatchFeature validation: they get .stack()-ed later in
+            # _parse_and_validate_image_input.
+            batch = BatchFeature(non_image_inputs, tensor_type=return_tensors)
+            batch.update(image_inputs)
+            return batch
+
+        return BatchFeature(
+            {**non_image_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """Build the replacement text for one image placeholder.
+
+        The text is `IMG_START`, then `feature_size` copies of `IMG_CONTEXT`,
+        then `IMG_END`. `num_patches` is not read here.
+        """
+        repl_full = f"{IMG_START}{IMG_CONTEXT * feature_size}{IMG_END}"
+        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
+
+    def get_audio_repl(
+        self,
+        audio: npt.NDArray,
+    ) -> PromptUpdateDetails[str]:
+        """Build the replacement text for one audio placeholder.
+
+        The number of `AUDIO_CONTEXT` copies is what the audio extractor
+        reports for a clip of `len(audio)`.
+        """
+        extractor = self.audio_extractor
+        assert extractor is not None
+        placeholder_count = extractor.audio_token_count(len(audio))
+        repl_full = AUDIO_START + AUDIO_CONTEXT * placeholder_count + AUDIO_END
+        return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)
+
+    @classmethod
+    def get_video_repl(
+        cls,
+        *,
+        tokens_per_frame: list[int],
+        frames_indices: list[int],
+        frame_duration_ms: int,
+        tokenizer: TokenizerLike,
+        img_start_token_ids: list[int],
+        img_end_token_ids: list[int],
+        img_context_token_ids: list[int],
+    ) -> PromptUpdateDetails[list[int]]:
+        """
+        Build the token-id prompt replacement for one video.
+
+        The returned sequence only reserves the right number of prompt tokens;
+        it is not what finally replaces the placeholder tokens. The real
+        substitution happens in embed_multimodal of NemotronH_Nano_VL_V2
+        (_process_video_input -> _create_final_video_embeddings), which builds
+        the final embeddings from text embeddings for the indicator tokens and
+        video embeddings for the video tokens.
+
+        A single code path serves every case; they differ only in
+        tokens_per_frame:
+        - non EVS: the same value for every frame
+        - EVS dummy: how tokens are split between frames does not matter, only
+          the total has to be correct
+        - EVS real (called from get_real_video_repl_for_evs): a different
+          value per frame
+
+        Args:
+            tokens_per_frame (list[int]): number of tokens per frame
+            frames_indices (list[int]): frame indices
+            frame_duration_ms (int): duration of each frame in milliseconds
+            tokenizer (TokenizerLike): tokenizer used for the frame separators
+            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
+            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
+            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
+
+        Returns:
+            The concatenated token ids wrapped by PromptUpdateDetails.from_seq.
+        """
+        # TODO: Add support of frame_duration_ms to be None
+        # At preprocessing step we should allow absent / metadata without
+        # frames_indices field.
+        if frame_duration_ms is not None:
+            timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
+
+            assert len(timestamps) == len(tokens_per_frame), (
+                "timestamps and tokens_per_frame must have the same length"
+            )
+            separator_texts = [
+                f"Frame {frame_no} sampled at {timestamp:.2f} seconds: "
+                for frame_no, timestamp in enumerate(timestamps, start=1)
+            ]
+        else:
+            separator_texts = [
+                f"Frame {frame_no}: "
+                for frame_no, _ in enumerate(tokens_per_frame, start=1)
+            ]
+
+        # Every piece is tokenized on its own so the tokenizer cannot merge
+        # tokens across piece boundaries; the result is therefore the same
+        # whatever the per-frame token counts are.
+        separator_ids = [_seq2tokens(tokenizer, text) for text in separator_texts]
+
+        token_ids = []
+        for frame_idx, frame_token_count in enumerate(tokens_per_frame):
+            token_ids += separator_ids[frame_idx]
+            # Special tokens arrive pre-tokenized from the caller.
+            token_ids += img_start_token_ids
+            token_ids += img_context_token_ids * frame_token_count
+            token_ids += img_end_token_ids
+
+        return PromptUpdateDetails.from_seq(token_ids)
--- a/vllm/transformers_utils/processors/nemotron_parse.py
+++ b/vllm/transformers_utils/processors/nemotron_parse.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
+# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
+from typing import TypeVar
+
+import numpy as np
+import torch
+from PIL import Image
+from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from torchvision import transforms as T
+from transformers import BatchFeature, PretrainedConfig, TensorType
+
+from vllm.tokenizers import TokenizerLike
+
+# Generic element type used by NemotronParseProcessor._make_batch_input.
+_T = TypeVar("_T")
+
+# Fallback target size; NemotronParseImageProcessor unpacks it as
+# (height, width).
+DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
+
+
+class NemotronParseImageProcessor:
+    """
+    NemotronParse Image Processor
+
+    Each image is shrunk (keeping its aspect ratio) until it fits inside the
+    target size, padded with white by ``albumentations.PadIfNeeded``, turned
+    into a tensor by torchvision's ``ToTensor`` and finally normalized with the
+    OpenAI CLIP mean / std.
+    """
+
+    def __init__(
+        self,
+        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
+        **kwargs,
+    ):
+        """
+        Args:
+            final_size: Target size. A list/tuple with at least two items is
+                read as ``(height, width)``; a single number is used for both
+                dimensions; anything else falls back to
+                ``DEFAULT_FINAL_IMAGE_SIZE``.
+            **kwargs: Accepted and ignored.
+
+        Raises:
+            ImportError: If ``albumentations`` is not installed (raised by
+                ``_create_transforms``).
+        """
+        # Ensure final_size is properly formatted
+        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
+            self.final_size = (int(final_size[0]), int(final_size[1]))
+        elif isinstance(final_size, (int, float)):
+            self.final_size = (int(final_size), int(final_size))
+        else:
+            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback
+
+        # Shaped (1, 3, 1, 1) so they broadcast over a stacked image batch.
+        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
+        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
+
+        # Create transforms
+        self._create_transforms()
+
+    def _create_transforms(self):
+        """Create transform objects and derive the target height / width.
+
+        Raises:
+            ImportError: If ``albumentations`` cannot be imported.
+        """
+        try:
+            import albumentations as A
+        except ImportError as err:
+            raise ImportError(
+                "The package `albumentations` is required to use "
+                "NemotronParse model. Please install it with `pip install "
+                "albumentations`."
+            ) from err
+
+        # Ensure final_size is a tuple of integers
+        if isinstance(self.final_size, (list, tuple)):
+            self.target_height, self.target_width = (
+                int(self.final_size[0]),
+                int(self.final_size[1]),
+            )
+        else:
+            self.target_height = self.target_width = int(self.final_size)
+
+        import cv2
+
+        # White-pads images that are smaller than the target size.
+        self.transform = A.Compose(
+            [
+                A.PadIfNeeded(
+                    min_height=self.target_height,
+                    min_width=self.target_width,
+                    border_mode=cv2.BORDER_CONSTANT,
+                    fill=[255, 255, 255],
+                    p=1.0,
+                ),
+            ]
+        )
+
+        self.torch_transform = T.Compose(
+            [
+                T.ToTensor(),
+            ]
+        )
+
+    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
+        """Resize image maintaining aspect ratio (replica of the original
+        LongestMaxSizeHW).
+
+        The image is only ever scaled down: first so its height fits the
+        target height, then so its width fits the target width. Both computed
+        dimensions are clamped to at least one pixel, because ``int()``
+        truncation on extremely elongated images would otherwise produce a
+        zero-sized target that ``cv2.resize`` rejects.
+        """
+        height, width = image.shape[:2]
+        max_size_height = self.target_height
+        max_size_width = self.target_width
+
+        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
+        aspect_ratio = width / height
+        new_height = height
+        new_width = width
+
+        # If height too big then scale image down
+        if height > max_size_height:
+            new_height = max_size_height
+            new_width = max(1, int(new_height * aspect_ratio))
+
+        # If width too big, scale image down further
+        if new_width > max_size_width:
+            new_width = max_size_width
+            new_height = max(1, int(new_width / aspect_ratio))
+
+        # Use cv2.INTER_LINEAR like the original
+        import cv2
+
+        return cv2.resize(
+            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
+        )
+
+    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
+        """Pad the bottom and right of ``image`` with white up to target size.
+
+        Returns the input unchanged when it is already at least as large as
+        the target in both dimensions. ``preprocess`` only calls this when
+        ``self.transform`` is ``None``.
+        """
+        h, w = image.shape[:2]
+        min_height, min_width = self.target_height, self.target_width
+
+        # Only pad if image is smaller than target
+        pad_h = max(0, min_height - h)
+        pad_w = max(0, min_width - w)
+
+        if pad_h == 0 and pad_w == 0:
+            return image
+
+        if len(image.shape) == 3:
+            # Color image - pad bottom and right with white (255, 255, 255)
+            padded = np.pad(
+                image,
+                ((0, pad_h), (0, pad_w), (0, 0)),
+                mode="constant",
+                constant_values=255,
+            )
+        else:
+            # Grayscale image - pad with white (255)
+            padded = np.pad(
+                image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
+            )
+
+        return padded
+
+    def preprocess(
+        self,
+        images: Image.Image | list[Image.Image],
+        **kwargs,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Preprocess an image or batch of images for the NemotronParse model.
+
+        Args:
+            images: Input image(s)
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            A dict with a single ``"pixel_values"`` entry holding the stacked,
+            normalized image tensors.
+        """
+        # Ensure images is a list
+        if not isinstance(images, list):
+            images = [images]
+
+        # Convert PIL images to numpy arrays if needed
+        processed_images = []
+        for image in images:
+            if isinstance(image, Image.Image):
+                image = np.asarray(image)
+            processed_images.append(image)
+
+        # Apply NemotronParse-specific transforms
+        pixel_values = []
+        for image in processed_images:
+            # Manual resize with aspect ratio preservation
+            # (replaces LongestMaxSizeHW)
+            processed_image = self._resize_with_aspect_ratio(image)
+
+            # Apply remaining albumentations transforms if available
+            if self.transform is not None:
+                transformed = self.transform(image=processed_image)
+                processed_image = transformed["image"]
+            else:
+                # Fallback: just pad to target size
+                processed_image = self._pad_to_size(processed_image)
+
+            # Convert to tensor
+            pixel_values_tensor = self.torch_transform(processed_image)
+
+            # Handle grayscale images: repeat the single channel three times
+            if pixel_values_tensor.shape[0] == 1:
+                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
+
+            pixel_values.append(pixel_values_tensor)
+
+        # Stack into batch
+        pixel_values = torch.stack(pixel_values)
+
+        # Normalize pixel values
+        normalized_values = (pixel_values - self.norm_mean) / self.norm_std
+        return {"pixel_values": normalized_values}
+
+    def __call__(
+        self, images: Image.Image | list[Image.Image], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        """Alias for ``preprocess``."""
+        return self.preprocess(images, **kwargs)
+
+
+class NemotronParseProcessor:
+    """
+    NemotronParse Processor
+
+    Tokenizes the text with the given tokenizer and, when images are present,
+    runs them through a ``NemotronParseImageProcessor`` sized from
+    ``config.image_size``.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        **kwargs,
+    ) -> None:
+        """Store ``config`` / ``tokenizer`` and build the image processor.
+
+        Extra keyword arguments are accepted and ignored.
+        """
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        target_size = config.image_size
+        self.image_processor = NemotronParseImageProcessor(final_size=target_size)
+
+    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
+        """Normalize ``input_item`` to a list.
+
+        ``None`` becomes an empty list, a list is returned as is, and any
+        other value is wrapped in a one-element list.
+        """
+        if input_item is None:
+            return []
+        if isinstance(input_item, list):
+            return input_item
+        return [input_item]
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Tokenize ``text`` and preprocess ``images`` into one BatchFeature.
+
+        Extra keyword arguments are forwarded to the tokenizer call.
+        """
+        batched_text = self._make_batch_input(text)
+        batched_images = self._make_batch_input(images)
+
+        if batched_images:
+            image_inputs = self.image_processor(batched_images)
+        else:
+            image_inputs = {}
+
+        text_inputs = self.tokenizer(
+            batched_text, add_special_tokens=False, **kwargs
+        )
+        return BatchFeature(
+            data={**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )
--- a/vllm/transformers_utils/processors/nemotron_vl.py
+++ b/vllm/transformers_utils/processors/nemotron_vl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from transformers import PretrainedConfig
+from transformers.image_processing_utils_fast import BaseImageProcessorFast
+
+from vllm.multimodal.image import convert_image_mode
+from vllm.multimodal.processing import PromptUpdateDetails
+from vllm.tokenizers import TokenizerLike
+
+from .internvl import InternVLProcessor
+
+# Configure PIL to handle large images without warnings
+# This prevents DecompressionBombWarning for legitimate large images
+# NOTE(review): this assignment runs at import time and mutates the shared
+# PIL.Image module attribute, so it is not limited to this processor -
+# confirm that removing the limit for the whole process is intended.
+Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
+# Alternative: Set a specific higher limit
+# Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
+
+
+def build_transform(input_size: int):
+    """Return the base tile transform: RGB conversion, resize, ToTensor.
+
+    Each image is converted to RGB, resized with bicubic interpolation to
+    ``input_size`` x ``input_size`` and converted to a tensor. No
+    normalization is applied here.
+    """
+
+    def _to_rgb(img):
+        return convert_image_mode(img, "RGB")
+
+    square = (input_size, input_size)
+    steps = [
+        T.Lambda(_to_rgb),
+        T.Resize(square, interpolation=T.InterpolationMode.BICUBIC),
+        T.ToTensor(),
+    ]
+    return T.Compose(steps)
+
+
+# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
+def find_closest_aspect_ratio(
+    aspect_ratio: float,
+    target_ratios: list[tuple[int, int]],
+    *,
+    width: int,
+    height: int,
+    image_size: int,
+) -> tuple[int, int]:
+    """Pick the tiling ``(columns, rows)`` that scores best for an image.
+
+    A candidate's score is the product of
+    - a size factor: tiled area divided by the image area, capped at 0.6, and
+    - a closeness factor: the smaller of the two quotients between the
+      candidate's aspect ratio and ``aspect_ratio`` (1.0 on an exact match).
+
+    The first candidate with the highest score wins; ``(1, 1)`` is returned
+    when ``target_ratios`` is empty.
+    """
+    image_area = width * height
+    winner = (1, 1)
+    top_score = float("-inf")
+
+    for cols, rows in target_ratios:
+        candidate_ratio = cols / rows
+        coverage = min((cols * rows * image_size * image_size) / image_area, 0.6)
+        closeness = min(
+            candidate_ratio / aspect_ratio, aspect_ratio / candidate_ratio
+        )
+        score = coverage * closeness
+
+        # Strict comparison keeps the earliest candidate on ties.
+        if score > top_score:
+            top_score, winner = score, (cols, rows)
+
+    return winner
+
+
+def calculate_nemotron_vl_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int]:
+    """Compute the tile count and resized dimensions for an image.
+
+    Returns:
+        ``(blocks, target_width, target_height)``. The width and height are
+        ``image_size`` times the chosen tiling's columns and rows. ``blocks``
+        is columns * rows, plus one when ``use_thumbnail`` is set and the
+        tiling has more than one tile.
+    """
+    # Choose the tiling for this image from the allowed candidates.
+    best = find_closest_aspect_ratio(
+        orig_width / orig_height,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
+    cols, rows = best[0], best[1]
+
+    blocks = cols * rows
+    # A thumbnail is only added on top of a multi-tile layout.
+    if use_thumbnail and blocks != 1:
+        blocks += 1
+
+    return blocks, image_size * cols, image_size * rows
+
+
+def dynamic_preprocess_nemotron_vl(
+    image: Image.Image,
+    *,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> list[Image.Image]:
+    """Split ``image`` into square tiles, optionally followed by a thumbnail.
+
+    The image is resized to the target layout and cut row by row into
+    ``image_size`` x ``image_size`` crops. When ``use_thumbnail`` is set and
+    more than one tile was produced, the whole image resized to a single tile
+    is appended last.
+    """
+    orig_width, orig_height = image.size
+
+    # Tile count here excludes the thumbnail; it is appended further down.
+    blocks, target_width, target_height = calculate_nemotron_vl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
+    resized_img = image.resize((target_width, target_height))
+    tiles_per_row = target_width // image_size
+
+    tiles = []
+    for tile_idx in range(blocks):
+        row, col = divmod(tile_idx, tiles_per_row)
+        left = col * image_size
+        upper = row * image_size
+        tiles.append(
+            resized_img.crop((left, upper, left + image_size, upper + image_size))
+        )
+
+    assert len(tiles) == blocks
+
+    if use_thumbnail and len(tiles) != 1:
+        tiles.append(image.resize((image_size, image_size)))
+
+    return tiles
+
+
+def get_nemotron_vl_target_ratios(
+    min_num: int,
+    max_num: int,
+) -> list[tuple[int, int]]:
+    """List every tiling ``(i, j)`` whose tile count is in ``[min_num, max_num]``.
+
+    The result is sorted by tile count ``i * j`` in ascending order.
+    """
+    ratios: set[tuple[int, int]] = set()
+    for n in range(min_num, max_num + 1):
+        for i in range(1, n + 1):
+            for j in range(1, n + 1):
+                if min_num <= i * j <= max_num:
+                    ratios.add((i, j))
+
+    return sorted(ratios, key=lambda ratio: ratio[0] * ratio[1])
+
+
+def image_to_pixel_values_nemotron_vl(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    transform: T.Compose | None = None,
+) -> torch.Tensor:
+    """Tile ``image`` and stack the transformed tiles into one tensor.
+
+    Args:
+        image: Image to tile.
+        input_size: Side length of each square tile.
+        min_num: Smallest allowed tile count.
+        max_num: Largest allowed tile count.
+        use_thumbnail: Whether a thumbnail may be appended after the tiles.
+        transform: Per-tile transform; ``build_transform(input_size)`` is used
+            when omitted.
+
+    Returns:
+        The per-tile tensors stacked along a new leading dimension.
+    """
+    candidate_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
+
+    tile_transform = transform
+    if tile_transform is None:
+        tile_transform = build_transform(input_size=input_size)
+
+    tiles = dynamic_preprocess_nemotron_vl(
+        image,
+        target_ratios=candidate_ratios,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+    )
+
+    return torch.stack([tile_transform(tile) for tile in tiles])
+
+
+class NemotronVLProcessor(InternVLProcessor):
+    """InternVL-style processor using the Nemotron-VL tiling helpers.
+
+    Images are tiled with ``image_to_pixel_values_nemotron_vl`` and each
+    ``<image>`` placeholder in the prompt is expanded to
+    ``IMG_START + IMG_CONTEXT * feature_size + IMG_END``.
+    """
+
+    IMG_START = "<img>"
+    IMG_END = "</img>"
+    IMG_CONTEXT = "<image>"
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        image_processor: BaseImageProcessorFast | None,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> None:
+        """
+        Args:
+            config: Model config; ``force_image_size``, ``patch_size`` and
+                ``downsample_ratio`` are read from it.
+            tokenizer: Tokenizer used for text and for ``image_token_id``.
+            image_processor: Source of ``max_num_tiles`` and
+                ``use_thumbnail``. May be ``None``, in which case both values
+                fall back to ``config``.
+            min_dynamic_patch: Minimum tile count; defaults to 1.
+            max_dynamic_patch: Maximum tile count; defaults to
+                ``image_processor.max_num_tiles`` or, without an image
+                processor, ``config.max_dynamic_patch`` (1 if absent).
+            dynamic_image_size: Whether dynamic tiling is enabled; defaults
+                to ``True``.
+        """
+        # ABC.__init__ is called directly instead of super().__init__(), so
+        # the parent initializer is not run; every attribute is set here.
+        ABC.__init__(self)
+        self.config = config
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        image_size: int = config.force_image_size
+        patch_size: int = config.patch_size
+
+        if min_dynamic_patch is None:
+            min_dynamic_patch = 1
+        assert isinstance(min_dynamic_patch, int)
+
+        if max_dynamic_patch is None:
+            # image_processor may legitimately be None (the use_thumbnail
+            # handling below allows for it), so do not dereference it blindly.
+            if image_processor is not None:
+                max_dynamic_patch = image_processor.max_num_tiles
+            else:
+                max_dynamic_patch = getattr(config, "max_dynamic_patch", 1)
+        assert isinstance(max_dynamic_patch, int)
+
+        if dynamic_image_size is None:
+            dynamic_image_size = True
+        assert isinstance(dynamic_image_size, bool)
+
+        # Number of context tokens produced for a single tile.
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
+        self.image_size = image_size
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+        self.dynamic_image_size = dynamic_image_size
+
+        if image_processor is not None:
+            self.use_thumbnail = image_processor.use_thumbnail
+        else:
+            self.use_thumbnail = getattr(config, "use_thumbnail", True)
+
+    @property
+    def image_token_id(self) -> int:
+        """Vocabulary id of ``IMG_CONTEXT``."""
+        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
+
+    def _get_transform(self) -> T.Compose:
+        """Return the per-tile transform; subclasses may override."""
+        return build_transform(input_size=self.image_size)
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        """Return the number of image context tokens for an image of this size."""
+        target_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
+        )
+
+        num_patches, _, _ = calculate_nemotron_vl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=self.image_size,
+            target_ratios=target_ratios,
+            use_thumbnail=self.use_thumbnail,
+        )
+
+        return num_patches * self.num_image_token
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        """Tile every image and return one stacked tile tensor per image."""
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
+        )
+
+        # The transform does not depend on the image, so build it once.
+        transform = self._get_transform()
+
+        return [
+            image_to_pixel_values_nemotron_vl(
+                image,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=self.use_thumbnail,
+                transform=transform,
+            )
+            for image in images
+        ]
+
+    def _replace_image_tokens(
+        self,
+        text: list[str],
+        pixel_values_lst: list[torch.Tensor],
+    ) -> list[str]:
+        """Replace <image> placeholders with image tokens.
+
+        For each entry of ``pixel_values_lst`` the first remaining ``<image>``
+        in every prompt is expanded.
+        """
+        for pixel_values in pixel_values_lst:
+            num_patches = pixel_values.shape[0]
+            feature_size = num_patches * self.num_image_token
+            image_repl = self.get_image_repl(feature_size, num_patches)
+            # Use temporary placeholder to avoid replacing tokens we just inserted
+            masked_repl = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
+            text = [t.replace("<image>", masked_repl, 1) for t in text]
+        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
+
+    def _preprocess_image(
+        self,
+        text: list[str],
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> tuple[list[str], dict[str, torch.Tensor]]:
+        """Tile the images and expand their placeholders in ``text``.
+
+        Returns:
+            The updated prompts and a dict that is empty when there are no
+            images, otherwise holding ``pixel_values_flat`` (all tiles
+            concatenated) and ``image_num_patches`` (tile count per image).
+        """
+        if len(images) == 0:
+            image_inputs = {}
+        else:
+            pixel_values_lst = self._images_to_pixel_values_lst(
+                images,
+                min_dynamic_patch=min_dynamic_patch,
+                max_dynamic_patch=max_dynamic_patch,
+                dynamic_image_size=dynamic_image_size,
+            )
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": torch.tensor(
+                    [len(item) for item in pixel_values_lst]
+                ),
+            }
+
+            text = self._replace_image_tokens(text, pixel_values_lst)
+        return text, image_inputs
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """Build the replacement text for one image.
+
+        ``num_patches`` is accepted but not used by this implementation.
+        """
+        repl_features = self.IMG_CONTEXT * feature_size
+        repl_full = self.IMG_START + repl_features + self.IMG_END
+
+        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
+
+
+# SigLIP normalization constants: per-channel mean / std passed to
+# T.Normalize in build_siglip_transform.
+SIGLIP_MEAN = (0.5, 0.5, 0.5)
+SIGLIP_STD = (0.5, 0.5, 0.5)
+
+
+def build_siglip_transform(input_size: int):
+    """Build transform for SigLIP vision encoder with normalization.
+
+    Runs the base ``build_transform`` pipeline and then normalizes with
+    ``SIGLIP_MEAN`` / ``SIGLIP_STD``.
+    """
+    base = build_transform(input_size=input_size)
+    normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
+    return T.Compose([base, normalize])
+
+
+class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
+    """
+    Processor for LlamaNemotronVL embedding model.
+
+    Differences from NemotronVLProcessor:
+    - the tile transform adds SigLIP normalization on top of the base one
+    - the image context token is <IMG_CONTEXT> rather than <image>
+    - tiling defaults come from ``processor_config`` / ``config`` because no
+      image processor object is passed to the parent
+    """
+
+    IMG_CONTEXT = "<IMG_CONTEXT>"
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        processor_config: dict,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> None:
+        """Resolve unset tiling options, then initialize the parent.
+
+        An option left as ``None`` is looked up in ``processor_config`` first
+        and in ``config`` second, before falling back to a built-in default.
+        """
+
+        def _lookup(key: str, attr: str, default):
+            return processor_config.get(key, getattr(config, attr, default))
+
+        if min_dynamic_patch is None:
+            min_dynamic_patch = _lookup("min_input_tiles", "min_dynamic_patch", 1)
+        if max_dynamic_patch is None:
+            max_dynamic_patch = _lookup("max_input_tiles", "max_dynamic_patch", 1)
+        if dynamic_image_size is None:
+            dynamic_image_size = _lookup(
+                "dynamic_image_size", "dynamic_image_size", True
+            )
+
+        super().__init__(
+            config=config,
+            tokenizer=tokenizer,
+            image_processor=None,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+    def _get_transform(self) -> T.Compose:
+        """Return the SigLIP-normalized tile transform."""
+        return build_siglip_transform(input_size=self.image_size)
+
+    def _replace_image_tokens(
+        self,
+        text: list[str],
+        pixel_values_lst: list[torch.Tensor],
+    ) -> list[str]:
+        """Expand one <image> placeholder per image, without masking.
+
+        The inserted text uses <IMG_CONTEXT>, not <image>, so a later
+        iteration cannot re-match text inserted by an earlier one and no
+        temporary placeholder is required.
+        """
+        for tiles in pixel_values_lst:
+            tile_count = tiles.shape[0]
+            repl = self.get_image_repl(tile_count * self.num_image_token, tile_count)
+            text = [prompt.replace("<image>", repl.full, 1) for prompt in text]
+        return text
--- a/vllm/transformers_utils/processors/nvlm_d.py
+++ b/vllm/transformers_utils/processors/nvlm_d.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
+# --------------------------------------------------------
+# NVLM-D
+# Copyright (c) 2024 NVIDIA
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+from vllm.multimodal.processing import PromptUpdateDetails
+
+from .internvl import BaseInternVLProcessor
+
+# Token repeated for every image feature slot in NVLMProcessor.get_image_repl;
+# its vocabulary id is returned by NVLMProcessor.image_token_id.
+IMG_PAD = "<|vision_pad|>"
+
+
+class NVLMProcessor(BaseInternVLProcessor):
+    """InternVL-based processor that emits NVLM-D tile-tagged image text."""
+
+    @property
+    def image_token_id(self) -> int:
+        """Vocabulary id of ``IMG_PAD``."""
+        vocab = self.tokenizer.get_vocab()
+        return vocab[IMG_PAD]
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """Build the replacement text for one image.
+
+        Every tile contributes its position tag followed by
+        ``feature_size // num_patches`` copies of ``IMG_PAD``; the tags are
+        ``<tile_1>`` .. ``<tile_{num_patches - 1}>`` plus
+        ``<tile_global_thumbnail>`` when thumbnails are enabled. The whole
+        run is wrapped in ``<Image>`` / ``</Image>``.
+
+        Raises:
+            NotImplementedError: If ``num_patches`` is ``None``.
+        """
+        if num_patches is None:
+            raise NotImplementedError("Embedding inputs are not supported")
+
+        tile_tags = [f"<tile_{tile_no}>" for tile_no in range(1, num_patches)]
+        if self.use_thumbnail:
+            tile_tags.append("<tile_global_thumbnail>")
+
+        pads_per_tile = feature_size // num_patches
+        pad_run = IMG_PAD * pads_per_tile
+        features = "".join(tag + pad_run for tag in tile_tags)
+
+        # We include the start and end as well because "<Image><tile" is
+        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
+        # when trying to find "<tile" as a subsequence of "<Image><tile"
+        wrapped = "<Image>" + features + "</Image>"
+
+        return PromptUpdateDetails.select_text(wrapped, IMG_PAD)
--- a/vllm/transformers_utils/processors/skyworkr1v.py
+++ b/vllm/transformers_utils/processors/skyworkr1v.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
+# --------------------------------------------------------
+# SkyworkR1V
+# Copyright (c) 2025 Skywork
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from transformers import BatchFeature, PretrainedConfig, TensorType
+
+from vllm.multimodal.image import convert_image_mode
+from vllm.multimodal.processing import PromptUpdateDetails
+from vllm.tokenizers import TokenizerLike
+
+# Image marker strings for prompts.
+# NOTE(review): presumably used by SkyworkR1VProcessor when inserting image
+# tokens - confirm against the class body.
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<IMG_CONTEXT>"
+
+# Per-channel mean / std passed to T.Normalize in build_transform.
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
+def build_transform(input_size: int):
+    """Return the tile transform: RGB, bicubic resize, ToTensor, Normalize.
+
+    Tiles are resized to ``input_size`` x ``input_size`` and normalized with
+    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
+    """
+
+    def _to_rgb(img):
+        return convert_image_mode(img, "RGB")
+
+    square = (input_size, input_size)
+    steps = [
+        T.Lambda(_to_rgb),
+        T.Resize(square, interpolation=T.InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
+    ]
+    return T.Compose(steps)
+
+
+# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
+def find_closest_aspect_ratio(
+    aspect_ratio: float,
+    target_ratios: list[tuple[int, int]],
+    *,
+    width: int,
+    height: int,
+    image_size: int,
+) -> tuple[int, int]:
+    """Pick the tiling whose aspect ratio is closest to ``aspect_ratio``.
+
+    A candidate that exactly ties the best distance so far replaces the
+    current choice only when the image area exceeds half of that candidate's
+    tiled area. ``(1, 1)`` is returned when ``target_ratios`` is empty.
+    """
+    image_area = width * height
+    chosen = (1, 1)
+    smallest_gap = float("inf")
+
+    for ratio in target_ratios:
+        gap = abs(aspect_ratio - ratio[0] / ratio[1])
+
+        if gap < smallest_gap:
+            smallest_gap, chosen = gap, ratio
+            continue
+
+        if gap != smallest_gap:
+            continue
+
+        # Exact tie: switch to this candidate only for large enough images.
+        if image_area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+            chosen = ratio
+
+    return chosen
+
+
+def resolve_skyworkr1v_min_max_num(
+    *,
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
+    use_thumbnail: bool,
+) -> tuple[int, int]:
+    """Resolve the effective ``(min, max)`` patch counts.
+
+    With dynamic sizing off both bounds are 1. Otherwise the given bounds are
+    used, and the maximum grows by one when a thumbnail is enabled and the
+    maximum is not 1.
+    """
+    if not dynamic_image_size:
+        # A maximum of 1 never receives the thumbnail increment.
+        return 1, 1
+
+    lower, upper = min_dynamic_patch, max_dynamic_patch
+    if use_thumbnail and upper != 1:
+        upper += 1
+
+    return lower, upper
+
+
+def get_skyworkr1v_target_ratios(
+    min_num: int,
+    max_num: int,
+) -> list[tuple[int, int]]:
+    """List every tiling ``(i, j)`` whose tile count is in ``[min_num, max_num]``.
+
+    The result is sorted by tile count ``i * j`` in ascending order.
+    """
+    ratios: set[tuple[int, int]] = set()
+    for n in range(min_num, max_num + 1):
+        for i in range(1, n + 1):
+            for j in range(1, n + 1):
+                if min_num <= i * j <= max_num:
+                    ratios.add((i, j))
+
+    return sorted(ratios, key=lambda ratio: ratio[0] * ratio[1])
+
+
+def calculate_skyworkr1v_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int]:
+    """Compute the tile count and resized dimensions for an image.
+
+    Returns:
+        ``(blocks, target_width, target_height)``. The width and height are
+        ``image_size`` times the chosen tiling's columns and rows. ``blocks``
+        is columns * rows, plus one when ``use_thumbnail`` is set and the
+        tiling has more than one tile.
+    """
+    # Choose the tiling for this image from the allowed candidates.
+    best = find_closest_aspect_ratio(
+        orig_width / orig_height,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
+    cols, rows = best[0], best[1]
+
+    blocks = cols * rows
+    # A thumbnail is only added on top of a multi-tile layout.
+    if use_thumbnail and blocks != 1:
+        blocks += 1
+
+    return blocks, image_size * cols, image_size * rows
+
+
+def dynamic_preprocess_skyworkr1v(
+    image: Image.Image,
+    *,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> list[Image.Image]:
+    """Split ``image`` into square tiles, optionally followed by a thumbnail.
+
+    The image is resized to the target layout and cut row by row into
+    ``image_size`` x ``image_size`` crops. When ``use_thumbnail`` is set and
+    more than one tile was produced, the whole image resized to a single tile
+    is appended last.
+    """
+    orig_width, orig_height = image.size
+
+    # Tile count here excludes the thumbnail; it is appended further down.
+    blocks, target_width, target_height = calculate_skyworkr1v_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False,
+    )
+
+    resized_img = image.resize((target_width, target_height))
+    tiles_per_row = target_width // image_size
+
+    tiles = []
+    for tile_idx in range(blocks):
+        row, col = divmod(tile_idx, tiles_per_row)
+        left = col * image_size
+        upper = row * image_size
+        tiles.append(
+            resized_img.crop((left, upper, left + image_size, upper + image_size))
+        )
+
+    assert len(tiles) == blocks
+
+    if use_thumbnail and len(tiles) != 1:
+        tiles.append(image.resize((image_size, image_size)))
+
+    return tiles
+
+
+# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
+def image_to_pixel_values_skyworkr1v(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    """
+    Convert one image into a stack of transformed tiles.
+
+    The image is split by `dynamic_preprocess_skyworkr1v` using the grids
+    from `get_skyworkr1v_target_ratios(min_num, max_num)`. Each resulting
+    image is passed through `build_transform(input_size=input_size)` and
+    the outputs are stacked with `torch.stack`.
+    """
+    grids = get_skyworkr1v_target_ratios(min_num, max_num)
+    to_tensor = build_transform(input_size=input_size)
+
+    tiles = dynamic_preprocess_skyworkr1v(
+        image,
+        target_ratios=grids,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+    )
+
+    return torch.stack([to_tensor(tile) for tile in tiles])
+
+
+class SkyworkR1VProcessor:
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+
+    The code to insert image tokens is based on:
+    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: TokenizerLike,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> None:
+        """
+        Store the config and tokenizer and derive the image settings.
+
+        Args:
+            config: Read for `vision_config.image_size`,
+                `vision_config.patch_size`, `downsample_ratio` and
+                `use_thumbnail`, and for any of the three settings below
+                that is left as `None`.
+            tokenizer: Kept as `self.tokenizer`.
+            min_dynamic_patch: Overrides `config.min_dynamic_patch`.
+            max_dynamic_patch: Overrides `config.max_dynamic_patch`.
+            dynamic_image_size: Overrides `config.dynamic_image_size`.
+        """
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        vision_config = config.vision_config
+        image_size: int = vision_config.image_size
+        patch_size: int = vision_config.patch_size
+
+        # An explicit argument takes precedence over the value in `config`
+        min_dynamic_patch = (
+            config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
+        )
+        assert isinstance(min_dynamic_patch, int)
+
+        max_dynamic_patch = (
+            config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
+        )
+        assert isinstance(max_dynamic_patch, int)
+
+        dynamic_image_size = (
+            config.dynamic_image_size
+            if dynamic_image_size is None
+            else dynamic_image_size
+        )
+        assert isinstance(dynamic_image_size, bool)
+
+        # Patches per side, squared, scaled by the squared downsample ratio
+        # and truncated to an integer
+        patches_per_side = image_size // patch_size
+        self.num_image_token = int(patches_per_side**2 * config.downsample_ratio**2)
+
+        self.image_size = image_size
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail: bool = config.use_thumbnail
+
+    @property
+    def image_token_id(self) -> int:
+        """Return the entry for `IMG_CONTEXT` in the tokenizer's vocabulary."""
+        vocab = self.tokenizer.get_vocab()
+        return vocab[IMG_CONTEXT]
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: int | None,
+    ) -> PromptUpdateDetails[str]:
+        """
+        Build the replacement text for one image placeholder.
+
+        The full text is `IMG_START`, followed by `IMG_CONTEXT` repeated
+        `feature_size` times, followed by `IMG_END`. It is handed to
+        `PromptUpdateDetails.select_text` together with `IMG_CONTEXT`.
+        `num_patches` is accepted but not used here.
+        """
+        context_tokens = IMG_CONTEXT * feature_size
+
+        return PromptUpdateDetails.select_text(
+            IMG_START + context_tokens + IMG_END,
+            IMG_CONTEXT,
+        )
+
+    def resolve_min_max_num(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+    ) -> tuple[int, int]:
+        """
+        Resolve the minimum and maximum patch counts.
+
+        Any argument left as `None` is replaced by the attribute of the
+        same name on this processor. The four values are then passed to
+        `resolve_skyworkr1v_min_max_num`, whose result is returned.
+        """
+        if min_dynamic_patch is None:
+            min_dynamic_patch = self.min_dynamic_patch
+        if max_dynamic_patch is None:
+            max_dynamic_patch = self.max_dynamic_patch
+        if dynamic_image_size is None:
+            dynamic_image_size = self.dynamic_image_size
+        if use_thumbnail is None:
+            use_thumbnail = self.use_thumbnail
+
+        return resolve_skyworkr1v_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+
+    def resolve_target_ratios(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+    ) -> list[tuple[int, int]]:
+        """
+        Return the candidate grids for the given settings.
+
+        The arguments are forwarded unchanged to `resolve_min_max_num`, and
+        the resulting bounds are passed to `get_skyworkr1v_target_ratios`.
+        """
+        lower, upper = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+        grids = get_skyworkr1v_target_ratios(lower, upper)
+        return grids
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        """
+        Return the number of image tokens for an image of the given size.
+
+        This is the patch count from `calculate_skyworkr1v_targets`
+        multiplied by `self.num_image_token`.
+        """
+        # The thumbnail is accounted for by `calculate_skyworkr1v_targets`,
+        # so it is switched off while resolving the candidate grids.
+        grids = self.resolve_target_ratios(use_thumbnail=False)
+
+        patch_count, _width, _height = calculate_skyworkr1v_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=self.image_size,
+            target_ratios=grids,
+            use_thumbnail=self.use_thumbnail,
+        )
+
+        return patch_count * self.num_image_token
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        """
+        Convert each image with `image_to_pixel_values_skyworkr1v`.
+
+        The patch bounds are resolved once via `resolve_min_max_num` and
+        reused for every image. The returned list has one entry per input
+        image, in the same order.
+        """
+        # The thumbnail is handled by `image_to_pixel_values_skyworkr1v`,
+        # so it is switched off while resolving the bounds.
+        lower, upper = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,
+        )
+
+        pixel_values_lst = []
+        for item in images:
+            pixel_values = image_to_pixel_values_skyworkr1v(
+                item,
+                input_size=self.image_size,
+                min_num=lower,
+                max_num=upper,
+                use_thumbnail=self.use_thumbnail,
+            )
+            pixel_values_lst.append(pixel_values)
+
+        return pixel_values_lst
+
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+    ) -> BatchFeature:
+        """
+        Tokenize the text and preprocess the images.
+
+        `text` and `images` may each be a single item, a list, or `None`
+        (treated as an empty list). For every image, the first remaining
+        `"<image>"` in each text entry is replaced by the full text from
+        `get_image_repl`.
+
+        Returns:
+            A `BatchFeature` holding the tokenizer output and, when at
+            least one image is given, `pixel_values_flat` (all per-image
+            tensors concatenated) and `image_num_patches` (the length of
+            each per-image tensor).
+        """
+        if text is None:
+            text = []
+        elif not isinstance(text, list):
+            text = [text]
+
+        if images is None:
+            images = []
+        elif not isinstance(images, list):
+            images = [images]
+
+        image_inputs = {}
+        if images:
+            pixel_values_lst = self._images_to_pixel_values_lst(
+                images,
+                min_dynamic_patch=min_dynamic_patch,
+                max_dynamic_patch=max_dynamic_patch,
+                dynamic_image_size=dynamic_image_size,
+            )
+
+            patch_counts = [len(item) for item in pixel_values_lst]
+            image_inputs = {
+                "pixel_values_flat": torch.cat(pixel_values_lst),
+                "image_num_patches": torch.tensor(patch_counts),
+            }
+
+            for pixel_values in pixel_values_lst:
+                num_patches = pixel_values.shape[0]
+                image_repl = self.get_image_repl(
+                    num_patches * self.num_image_token,
+                    num_patches,
+                )
+
+                # Only the first placeholder left in each prompt is consumed
+                text = [
+                    prompt.replace("<image>", image_repl.full, 1) for prompt in text
+                ]
+
+        text_inputs = self.tokenizer(text)
+
+        return BatchFeature(
+            {**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )