[2/3] Refactor InternVL-based processors (#37324)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[2/3] Refactor InternVL-based processors (#37324)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
99267c23 · Cyrus Leung · GitHub · 525f2eeb · 99267c23 · 99267c23
Unverified Commit 99267c23 authored Mar 18, 2026 by Cyrus Leung Committed by GitHub Mar 18, 2026
18 changed files
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.image_size = self.vision_config.image_size

        def __call__(self, text: str, images: Image | list[Image], **kwargs):
-            from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT,
-                IMG_END,
-                IMG_START,
+            from vllm.transformers_utils.processors.h2ovl import (
                image_to_pixel_values_h2ovl,
            )

+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
+
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
                image_to_pixel_values_h2ovl(
@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.image_size = self.vision_config.image_size

        def __call__(self, text: str, images: Image | list[Image], **kwargs):
-            from vllm.model_executor.models.skyworkr1v import (
-                IMG_CONTEXT,
-                IMG_END,
-                IMG_START,
-                image_to_pixel_values_skyworkr1v,
+            from vllm.transformers_utils.processors.internvl import (
+                image_to_pixel_values_internvl,
            )

+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
+
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
-                image_to_pixel_values_skyworkr1v(
+                image_to_pixel_values_internvl(
                    image,
                    input_size=self.image_size,
                    min_num=self.min_num,
@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            videos: npt.NDArray | list[npt.NDArray] = None,
            **kwargs,
        ):
-            from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT,
-                IMG_END,
-                IMG_START,
+            from vllm.transformers_utils.processors.internvl import (
                image_to_pixel_values_internvl,
                video_to_pixel_values_internvl,
            )

+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
+
            images = [images] if isinstance(images, Image) else images
            videos = [videos] if isinstance(videos, np.ndarray) else videos
            if images is not None:

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "rednote-hilab/dots.ocr", trust_remote_code=True
    ),
    "Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
+        "nvidia/Eagle2.5-8B",
+        trust_remote_code=True,
    ),
    "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
    "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(

--- a/vllm/model_executor/models/eagle2_5_vl.py
+++ b/vllm/model_executor/models/eagle2_5_vl.py
@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
+from vllm.transformers_utils.processors.internvl import (
+    InternVLImageProcessor,
+    InternVLProcessor,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (
@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = (
 class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
    """Processing info for Eagle2.5-VL model."""

-    def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor:
-        return self.ctx.init_processor(
-            Eagle2_5_VLProcessor,
-            config=self.ctx.get_hf_config(),
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault(
+            "image_size", config.force_image_size or vision_config.image_size
+        )
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+        return InternVLImageProcessor(**kwargs)
+
+    def get_hf_processor(self, **kwargs) -> InternVLProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        return InternVLProcessor(
            tokenizer=self.get_tokenizer(),
-            **kwargs,
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )



--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
        vision_config = config.vision_config

        image_size = vision_config["image_size"]
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
        kwargs.setdefault("size", {"width": image_size, "height": image_size})

        return GLM4VImageProcessorFast(**kwargs)

    def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
-        return self.ctx.init_processor(
-            GLM4VProcessor,
+        return GLM4VProcessor(
            tokenizer=self.get_tokenizer(),
            image_processor=self.get_image_processor(**kwargs),
        )

--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import (
    PromptUpdate,
    TimingContext,
 )
-from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
+from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor

 from .intern_vit import InternVisionModel
 from .internvl import (
@@ -40,12 +40,34 @@ from .internvl import (


 class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+        kwargs.setdefault("use_msac", config.use_msac)
+
+        return H2OVLImageProcessor(**kwargs)
+
    def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
-        return self.ctx.init_processor(
-            H2OVLProcessor,
-            config=self.get_hf_config(),
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        return H2OVLProcessor(
            tokenizer=self.get_tokenizer(),
-            **kwargs,
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )

    def get_num_image_tokens(
@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            return hf_processor.get_image_repl(feature_size, num_patches)
+            return hf_processor.get_image_repl(num_patches, num_features=feature_size)

        return [
            PromptReplacement(

--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -9,6 +9,7 @@
 # --------------------------------------------------------
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
+from functools import cached_property
 from typing import Annotated, Literal, TypeAlias, TypeVar

 import torch
@@ -45,8 +46,9 @@ from vllm.multimodal.processing import (
 )
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.processors.internvl import (
-    BaseInternVLProcessor,
+    InternVLImageProcessor,
    InternVLProcessor,
+    InternVLVideoProcessor,
 )
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
    """Basic image-only ProcessingInfo for InternVL-style models."""

    @abstractmethod
-    def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
        raise NotImplementedError

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
        *,
        image_width: int,
        image_height: int,
-        processor: BaseInternVLProcessor,
+        processor: InternVLProcessor,
    ) -> int:
        return processor.get_num_image_tokens(
            image_width=image_width,
@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):

    def get_image_size_with_most_features(self) -> ImageSize:
        processor = self.get_hf_processor()
+        image_processor = processor.image_processor

-        base_size = processor.image_size
+        base_size = image_processor.image_size
        target_ratios = processor.resolve_target_ratios()

        largest_feature_size, largest_feature_pinpoint = 0, None
@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
        )

        hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        image_token_id = hf_processor.image_token_id
+        image_token_id = hf_processor.ctx_image_token_id

        # Since there may be extra tokens in the feature placeholders,
        # we need to pass the image token ID to the model to select the
@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            return hf_processor.get_image_repl(feature_size, num_patches)
+            return hf_processor.get_image_repl(num_patches, num_features=feature_size)

        return [
            PromptReplacement(
@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
 class InternVLProcessingInfo(BaseInternVLProcessingInfo):
    """InternVL ProcessingInfo extended for video processing"""

-    @property
-    def supports_video(self):
-        return self.get_hf_processor().supports_video
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config

-    def get_supported_mm_limits(self):
-        video_limit = {"video": None} if self.supports_video else {}
-        return {**super().get_supported_mm_limits(), **video_limit}
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+        return InternVLImageProcessor(**kwargs)
+
+    def get_video_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config

-    def get_video_token(self) -> str | None:
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+
+        return InternVLVideoProcessor(**kwargs)
+
+    @cached_property
+    def ctx_video_token(self):
        text_model_type = self.get_hf_config().get_text_config().model_type
-        video_token_map = {
+        ctx_video_token_map = {
            "qwen2": "<|video_pad|>",
            "qwen3": "<|video_pad|>",
            "qwen3_moe": "<|video_pad|>",
            "gpt_oss": "<|reserved_200000|>",
        }
-        return video_token_map.get(text_model_type)
+
+        if text_model_type not in ctx_video_token_map:
+            return None
+
+        ctx_video_token = ctx_video_token_map[text_model_type]
+        if ctx_video_token not in self.get_tokenizer().get_vocab():
+            return None
+
+        return ctx_video_token
+
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        ctx_video_token = self.ctx_video_token
+        video_processor = (
+            self.get_video_processor(**kwargs) if ctx_video_token else None
+        )
+
+        return InternVLProcessor(
+            tokenizer=self.get_tokenizer(),
+            image_processor=image_processor,
+            video_processor=video_processor,
+            image_seq_length=image_seq_length,
+            ctx_video_token=ctx_video_token,
+        )
+
+    def get_supported_mm_limits(self):
+        video_limit = {"video": None} if self.ctx_video_token else {}
+        return {**super().get_supported_mm_limits(), **video_limit}

    def get_num_frames_with_most_features(
        self,
@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
        max_videos = mm_counts.get("video", 0)

        processor = self.get_hf_processor()
+        num_image_token = processor.image_seq_length

        max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
+        max_total_frames = (seq_len - max_image_tokens) // num_image_token
        max_frames_per_video = max_total_frames // max(max_videos, 1)

        return max(max_frames_per_video, 1)

-    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
-        return self.ctx.init_processor(
-            InternVLProcessor,
-            config=self.get_hf_config(),
-            tokenizer=self.get_tokenizer(),
-            video_token=self.get_video_token(),
-            **kwargs,
-        )
-

 class InternVLDummyInputsBuilder(
    BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder(
        mm_options: Mapping[str, BaseDummyOptions],
    ) -> MultiModalDataDict:
        dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
-        if self.info.supports_video:
+        if self.info.ctx_video_token:
            config = self.info.get_hf_config()
            image_size: int = config.vision_config.image_size
            target_num_frames = self.info.get_num_frames_with_most_features(
@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor(
        )

        hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        if (
-            self.info.supports_video
-            and (video_token_id := hf_processor.video_token_id) is not None
-        ):
+        if (video_token_id := hf_processor.ctx_video_token_id) is not None:
            processed_outputs["video_token_id"] = torch.tensor(video_token_id)
+
        return processed_outputs

    def _get_mm_fields_config(
@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor(
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
-        if self.info.supports_video:
+        if self.info.ctx_video_token:
            video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
            num_videos = len(video_num_patches)
            video_fields = dict(
@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor(
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
            out_mm_kwargs=out_mm_kwargs,
        )
+        if self.info.ctx_video_token is None:
+            return prompt_repl

        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

@@ -456,26 +501,20 @@ class InternVLMultiModalProcessor(
            video_num_patches = []

        def get_video_replacement_internvl(item_idx: int):
-            feature_size = hf_processor.num_image_token
            num_patches = video_num_patches[item_idx]
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            return hf_processor.get_video_repl(
-                feature_size, num_patches, video_context_token=hf_processor.video_token
-            )
-
-        if self.info.supports_video:
-            prompt_repl = [
-                *prompt_repl,
-                PromptReplacement(
-                    modality="video",
-                    target="<video>",
-                    replacement=get_video_replacement_internvl,
-                ),
-            ]
+            return hf_processor.get_video_repl(num_patches)

-        return prompt_repl
+        return [
+            *prompt_repl,
+            PromptReplacement(
+                modality="video",
+                target="<video>",
+                replacement=get_video_replacement_internvl,
+            ),
+        ]


 @MULTIMODAL_REGISTRY.register_processor(

--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.processor import cached_image_processor_from_config
 from vllm.transformers_utils.processors.nemotron_vl import (
+    LlamaNemotronNanoVLImageProcessor,
+    LlamaNemotronNanoVLProcessor,
+    LlamaNemotronVLEmbedImageProcessor,
    LlamaNemotronVLEmbedProcessor,
-    NemotronVLProcessor,
 )
 from vllm.transformers_utils.repo_utils import get_hf_file_to_dict

@@ -50,19 +52,34 @@ from .utils import (
 class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
    """Processing info for Nemotron VL models."""

-    def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor:
-        return self.ctx.init_processor(
-            NemotronVLProcessor,
-            config=self.get_hf_config(),
-            tokenizer=self.get_tokenizer(),
-            image_processor=self.get_image_processor(),
-            **kwargs,
+    def get_image_processor(self, **kwargs: object):
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        orig_processor = cached_image_processor_from_config(
+            self.ctx.model_config, **kwargs
        )

-    def get_image_processor(self, **kwargs: object):
-        return cached_image_processor_from_config(
-            self.ctx.model_config,
-            **kwargs,
+        return LlamaNemotronNanoVLImageProcessor(
+            image_size=orig_processor.image_size,
+            min_dynamic_patch=1,
+            max_dynamic_patch=orig_processor.max_num_tiles,
+            dynamic_image_size=True,
+            use_thumbnail=orig_processor.use_thumbnail,
+        )
+
+    def get_hf_processor(self, **kwargs: object) -> LlamaNemotronNanoVLProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        return LlamaNemotronNanoVLProcessor(
+            tokenizer=self.get_tokenizer(),
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )


@@ -386,29 +403,58 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
 # --------------------------------------------------------


-class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
+class LlamaNemotronVLEmbedProcessingInfo(BaseInternVLProcessingInfo):
    """Processing info for LlamaNemotronVL embedding model."""

-    def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
-        """Override to create embedding-specific processor without image_processor."""
+    def get_image_processor(self, **kwargs):
        model_config = self.ctx.model_config
-        processor_config = {}
-        if model_config.model is not None:
-            processor_config = (
-                get_hf_file_to_dict(
-                    "processor_config.json",
-                    model_config.model,
-                    model_config.revision,
-                )
-                or {}
+
+        config = self.get_hf_config()
+        processor_config = (
+            get_hf_file_to_dict(
+                "processor_config.json",
+                model_config.model,
+                model_config.revision,
            )
+            or {}
+        )
+
+        min_dynamic_patch = processor_config.get(
+            "min_input_tiles",
+            getattr(config, "min_dynamic_patch", 1),
+        )
+        max_dynamic_patch = processor_config.get(
+            "max_input_tiles",
+            getattr(config, "max_dynamic_patch", 1),
+        )
+        dynamic_image_size = processor_config.get(
+            "dynamic_image_size",
+            getattr(config, "dynamic_image_size", True),
+        )
+
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", config.force_image_size)
+        kwargs.setdefault("min_dynamic_patch", min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", True)
+
+        return LlamaNemotronVLEmbedImageProcessor(**kwargs)
+
+    def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))

-        return self.ctx.init_processor(
-            LlamaNemotronVLEmbedProcessor,
-            config=self.get_hf_config(),
+        return LlamaNemotronVLEmbedProcessor(
            tokenizer=self.get_tokenizer(),
-            processor_config=processor_config,
-            **kwargs,
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )



--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
    PromptUpdate,
    PromptUpdateDetails,
 )
-from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
+from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
+from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor

 from .intern_vit import InternVisionModel
 from .internvl import (
@@ -39,12 +40,33 @@ from .internvl import (


 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+        return InternVLImageProcessor(**kwargs)
+
    def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
-        return self.ctx.init_processor(
-            NVLMProcessor,
-            config=self.get_hf_config(),
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        return NVLMProcessor(
            tokenizer=self.get_tokenizer(),
-            **kwargs,
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )


@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            repl = hf_processor.get_image_repl(feature_size, num_patches)
+            repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)

-            return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)
+            return PromptUpdateDetails.select_text(
+                repl.full + "\n", hf_processor.ctx_image_token
+            )

        # See note in dummy data regarding why we have the extra newline
        return [

--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
        vision_config = config.visual

        image_size = vision_config["image_size"]
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
        kwargs.setdefault("size", {"width": image_size, "height": image_size})

        return QwenVLImageProcessorFast(**kwargs)

    def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
-        return self.ctx.init_processor(
-            QwenVLProcessor,
+        return QwenVLProcessor(
            tokenizer=self.get_tokenizer(),
            image_processor=self.get_image_processor(**kwargs),
        )

--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -43,7 +43,10 @@ from vllm.multimodal.processing import (
    PromptUpdate,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
+from vllm.transformers_utils.processors.internvl import (
+    InternVLImageProcessor,
+    InternVLProcessor,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = (


 class SkyworkR1VProcessingInfo(BaseProcessingInfo):
-    def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
-        return self.ctx.init_processor(
-            SkyworkR1VProcessor,
-            config=self.get_hf_config(),
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)
+
+        return InternVLImageProcessor(**kwargs)
+
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config
+
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
+
+        return InternVLProcessor(
            tokenizer=self.get_tokenizer(),
-            **kwargs,
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
        *,
        image_width: int,
        image_height: int,
-        processor: SkyworkR1VProcessor,
+        processor: InternVLProcessor,
    ) -> int:
        return processor.get_num_image_tokens(
            image_width=image_width,
@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):

    def get_image_size_with_most_features(self) -> ImageSize:
        processor = self.get_hf_processor()
+        image_processor = processor.image_processor

-        base_size = processor.image_size
+        base_size = image_processor.image_size
        target_ratios = processor.resolve_target_ratios()

        largest_feature_size, largest_feature_pinpoint = 0, None
@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
        )

        hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        image_token_id = hf_processor.image_token_id
+        image_token_id = hf_processor.ctx_image_token_id

        # Since there may be extra tokens in the feature placeholders,
        # we need to pass the image token ID to the model to select the
@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            return hf_processor.get_image_repl(feature_size, num_patches)
+            return hf_processor.get_image_repl(num_patches, num_features=feature_size)

        return [
            PromptReplacement(

--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -14,7 +14,6 @@ __all__ = [
    "BagelProcessor",
    "CohereASRProcessor",
    "DeepseekVLV2Processor",
-    "Eagle2_5_VLProcessor",
    "FireRedASR2Processor",
    "FunASRProcessor",
    "GLM4VProcessor",
@@ -34,14 +33,12 @@ __all__ = [
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
-    "SkyworkR1VProcessor",
 ]

 _CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
    "CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
-    "Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
    "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
    "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
-    "SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
 }



--- a/vllm/transformers_utils/processors/eagle2_5_vl.py
+++ b/vllm/transformers_utils/processors/eagle2_5_vl.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from NVIDIA Eagle2.5-VL model
-# https://huggingface.co/nvidia/Eagle2.5-8B
-from transformers import PretrainedConfig
-
-from vllm.multimodal.processing import PromptUpdateDetails
-from vllm.tokenizers import TokenizerLike
-
-from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
-
-
-class Eagle2_5_VLProcessor(BaseInternVLProcessor):
-    """
-    Custom processor for Eagle2.5-VL model.
-    Extends BaseInternVLProcessor with Eagle-specific token handling.
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        # Skip super().__init__() to avoid config manipulation
-        # Directly initialize all required attributes
-        self.config = config
-        self.tokenizer = tokenizer
-
-        # Image size with force_image_size override
-        image_size: int = config.vision_config.image_size
-        if hasattr(config, "force_image_size") and config.force_image_size:
-            image_size = config.force_image_size
-
-        patch_size: int = config.vision_config.patch_size
-        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
-
-        # Compute num_image_token
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (downsample_ratio**2)
-        )
-        self.image_size = image_size
-
-        # Dynamic patch settings with defaults
-        self.min_dynamic_patch = (
-            min_dynamic_patch
-            if min_dynamic_patch is not None
-            else getattr(config, "min_dynamic_patch", 1)
-        )
-        self.max_dynamic_patch = (
-            max_dynamic_patch
-            if max_dynamic_patch is not None
-            else getattr(config, "max_dynamic_patch", 12)
-        )
-        self.dynamic_image_size = (
-            dynamic_image_size
-            if dynamic_image_size is not None
-            else getattr(config, "dynamic_image_size", True)
-        )
-        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
-
-    @property
-    def image_token_id(self) -> int:
-        """Get the image token ID from config or tokenizer."""
-        if hasattr(self.config, "image_token_index"):
-            return self.config.image_token_index
-        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
-        vocab = self.tokenizer.get_vocab()
-        if IMG_CONTEXT in vocab:
-            return vocab[IMG_CONTEXT]
-        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        """Get image replacement string for prompt."""
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
--- a/vllm/transformers_utils/processors/h2ovl.py
+++ b/vllm/transformers_utils/processors/h2ovl.py
@@ -10,16 +10,12 @@
 # --------------------------------------------------------
 import torch
 from PIL import Image
-from transformers import PretrainedConfig

-from vllm.multimodal.processing import PromptUpdateDetails
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers.hf import HfTokenizer

 from .internvl import (
-    IMG_CONTEXT,
-    IMG_END,
-    IMG_START,
-    BaseInternVLProcessor,
+    InternVLImageProcessor,
+    InternVLProcessor,
    build_transform,
    find_closest_aspect_ratio,
    get_internvl_target_ratios,
@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl(
    return pixel_values


-class H2OVLProcessor(BaseInternVLProcessor):
+class H2OVLImageProcessor(InternVLImageProcessor):
    def __init__(
        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_msac: bool | None = None,
+        image_size: int,
+        min_dynamic_patch: int,
+        max_dynamic_patch: int,
+        dynamic_image_size: bool,
+        use_thumbnail: bool,
+        use_msac: bool,
    ) -> None:
        super().__init__(
-            config,
-            tokenizer,
+            image_size=image_size,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
        )

-        if use_msac is None:
-            use_msac = config.use_msac
-        assert isinstance(use_msac, bool)
-
        self.use_msac = use_msac

-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
    def resolve_min_max_num(
        self,
        *,
@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
+        if min_dynamic_patch is None:
+            min_dynamic_patch = self.min_dynamic_patch
+        if max_dynamic_patch is None:
+            max_dynamic_patch = self.max_dynamic_patch
+        if dynamic_image_size is None:
+            dynamic_image_size = self.dynamic_image_size
+        if use_thumbnail is None:
+            use_thumbnail = self.use_thumbnail

        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
@@ -284,6 +257,57 @@ class H2OVLProcessor(BaseInternVLProcessor):
            use_thumbnail=use_thumbnail,
        )

+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        use_msac = self.use_msac if len(images) == 1 else False
+
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
+        )
+
+        return [
+            image_to_pixel_values_h2ovl(
+                image,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=self.use_thumbnail,
+                use_msac=use_msac,
+            )
+            for image in images
+        ]
+
+
+class H2OVLProcessor(InternVLProcessor):
+    def __init__(
+        self,
+        image_processor: H2OVLImageProcessor,
+        tokenizer: HfTokenizer,
+        *,
+        image_seq_length: int,
+        start_image_token: str = "<img>",
+        end_image_token: str = "</img>",
+        ctx_image_token: str = "<IMG_CONTEXT>",
+    ) -> None:
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            image_seq_length=image_seq_length,
+            start_image_token=start_image_token,
+            end_image_token=end_image_token,
+            ctx_image_token=ctx_image_token,
+        )
+
+        self.image_processor: H2OVLImageProcessor
+
    def resolve_target_ratios(
        self,
        *,
@@ -294,7 +318,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
+        min_num, max_num = self.image_processor.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
@@ -316,9 +340,10 @@ class H2OVLProcessor(BaseInternVLProcessor):
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
-        use_msac = self.use_msac if use_msac is None else use_msac
+        image_processor = self.image_processor
+        use_msac = image_processor.use_msac if use_msac is None else use_msac

-        use_thumbnail = self.use_thumbnail
+        use_thumbnail = image_processor.use_thumbnail

        if use_msac:
            target_ratios_1 = self.resolve_target_ratios(
@@ -328,7 +353,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
            num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
-                image_size=self.image_size,
+                image_size=image_processor.image_size,
                target_ratios=target_ratios_1,
                use_thumbnail=True,
            )
@@ -341,7 +366,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
            num_patches_2, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
-                image_size=self.image_size,
+                image_size=image_processor.image_size,
                target_ratios=target_ratios_2,
                use_thumbnail=True,
            )
@@ -354,37 +379,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
            num_patches, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
-                image_size=self.image_size,
+                image_size=image_processor.image_size,
                target_ratios=target_ratios,
                use_thumbnail=use_thumbnail,
            )

-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        use_msac = self.use_msac if len(images) == 1 else False
-
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_h2ovl(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-                use_msac=use_msac,
-            )
-            for image in images
-        ]
+        return num_patches * self.image_seq_length
--- a/vllm/transformers_utils/processors/internvl.py
+++ b/vllm/transformers_utils/processors/internvl.py
@@ -7,24 +7,17 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-from abc import ABC, abstractmethod
-from typing import Any, TypeVar

 import numpy.typing as npt
 import torch
 import torchvision.transforms as T
 from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature, TensorType
+from transformers.processing_utils import ProcessorMixin

 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.processing import PromptUpdateDetails
-from vllm.tokenizers import TokenizerLike
-
-_T = TypeVar("_T")
-
-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<IMG_CONTEXT>"
+from vllm.tokenizers.hf import HfTokenizer

 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -33,7 +26,7 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def build_transform(input_size: int):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose(
+    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
@@ -43,7 +36,6 @@ def build_transform(input_size: int):
            T.Normalize(mean=MEAN, std=STD),
        ]
    )
-    return transform


 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
@@ -223,65 +215,20 @@ def video_to_pixel_values_internvl(
    return pixel_values


-class BaseInternVLProcessor(ABC):
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
-    """
-
+class InternVLImageProcessor:
    def __init__(
        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
+        image_size: int,
+        min_dynamic_patch: int,
+        max_dynamic_patch: int,
+        dynamic_image_size: bool,
+        use_thumbnail: bool,
    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        image_size: int = config.vision_config.image_size
-        patch_size: int = config.vision_config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = config.min_dynamic_patch
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = config.max_dynamic_patch
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = config.dynamic_image_size
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-
-    @property
-    @abstractmethod
-    def image_token_id(self) -> int:
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        raise NotImplementedError
+        self.use_thumbnail = use_thumbnail

    def resolve_min_max_num(
        self,
@@ -291,18 +238,14 @@ class BaseInternVLProcessor(ABC):
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
+        if min_dynamic_patch is None:
+            min_dynamic_patch = self.min_dynamic_patch
+        if max_dynamic_patch is None:
+            max_dynamic_patch = self.max_dynamic_patch
+        if dynamic_image_size is None:
+            dynamic_image_size = self.dynamic_image_size
+        if use_thumbnail is None:
+            use_thumbnail = self.use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
@@ -311,43 +254,6 @@ class BaseInternVLProcessor(ABC):
            use_thumbnail=use_thumbnail,
        )

-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-        return get_internvl_target_ratios(min_num, max_num)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_internvl_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
@@ -355,7 +261,14 @@ class BaseInternVLProcessor(ABC):
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
+        if min_dynamic_patch is None:
+            min_dynamic_patch = self.min_dynamic_patch
+        if max_dynamic_patch is None:
+            max_dynamic_patch = self.max_dynamic_patch
+        if dynamic_image_size is None:
+            dynamic_image_size = self.dynamic_image_size
+
+        min_num, max_num = resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
@@ -373,49 +286,9 @@ class BaseInternVLProcessor(ABC):
            for image in images
        ]

-    def _preprocess_image(
-        self,
-        text: list[str],
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> tuple[list[str], dict[str, torch.Tensor]]:
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-
-                image_repl = self.get_image_repl(feature_size, num_patches)
-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
-        return text, image_inputs
-
-    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
-        if input_item is None:
-            input_item = []
-        if not isinstance(input_item, list):
-            input_item = [input_item]
-        return input_item
-
    def __call__(
        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
+        images: Image.Image | list[Image.Image],
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
@@ -423,120 +296,173 @@ class BaseInternVLProcessor(ABC):
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
-        text = self._make_batch_input(text)
-        images = self._make_batch_input(images)
+        images_lst = [images] if not isinstance(images, list) else images

-        text, image_inputs = self._preprocess_image(
-            text=text,
-            images=images,
+        pixel_values_lst = self._images_to_pixel_values_lst(
+            images_lst,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

-        text_inputs = self.tokenizer(text)
+        image_inputs = {
+            "pixel_values_flat": torch.cat(pixel_values_lst),
+            "image_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
+        }
+        return BatchFeature(image_inputs, tensor_type=return_tensors)

-        combined_outputs = {**text_inputs, **image_inputs}

-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
+class InternVLVideoProcessor:
+    def __init__(
+        self,
+        image_size: int,
+    ) -> None:
+        self.image_size = image_size
+
+    def _videos_to_pixel_values_lst(
+        self,
+        videos: list[npt.NDArray],
+    ) -> list[torch.Tensor]:
+        return [
+            video_to_pixel_values_internvl(
+                video,
+                input_size=self.image_size,
+                min_num=1,
+                max_num=1,
+                use_thumbnail=False,
+            )
+            for video in videos
+        ]
+
+    def __call__(
+        self,
+        videos: npt.NDArray | list[npt.NDArray],
+        *,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        videos_lst = [videos] if not isinstance(videos, list) else videos

+        pixel_values_lst = self._videos_to_pixel_values_lst(videos_lst)
+
+        image_inputs = {
+            "pixel_values_flat_video": torch.cat(pixel_values_lst),
+            "video_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
+        }
+        return BatchFeature(image_inputs, tensor_type=return_tensors)

-class InternVLProcessor(BaseInternVLProcessor):
+
+class InternVLProcessor(ProcessorMixin):
    """
-    HF Processor for InternVLChatModel with extended video processing logic.
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+
+    The code to insert image tokens is based on:
+    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

+    attributes = ["image_processor", "tokenizer", "video_processor"]
+
    def __init__(
        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
+        image_processor: InternVLImageProcessor,
+        tokenizer: HfTokenizer,
+        video_processor: InternVLVideoProcessor | None = None,
+        *,
+        image_seq_length: int,
+        start_image_token: str = "<img>",
+        end_image_token: str = "</img>",
+        ctx_image_token: str = "<IMG_CONTEXT>",
+        ctx_video_token: str | None = None,
+    ) -> None:
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.video_processor = video_processor
+
+        self.image_seq_length = image_seq_length
+        self.start_image_token = start_image_token
+        self.end_image_token = end_image_token
+        self.ctx_image_token = ctx_image_token
+        self.ctx_video_token = ctx_video_token
+
+        self.start_image_token_id = tokenizer.convert_tokens_to_ids(start_image_token)
+        self.end_image_token_id = tokenizer.convert_tokens_to_ids(end_image_token)
+        self.ctx_image_token_id = tokenizer.convert_tokens_to_ids(ctx_image_token)
+        self.ctx_video_token_id = (
+            None
+            if ctx_video_token is None
+            else tokenizer.convert_tokens_to_ids(ctx_video_token)
+        )
+
+    def resolve_target_ratios(
+        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
-        video_token: str | None = None,
-    ) -> None:
-        super().__init__(
-            config=config,
-            tokenizer=tokenizer,
+        use_thumbnail: bool | None = None,
+    ) -> list[tuple[int, int]]:
+        min_num, max_num = self.image_processor.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
        )
-        # add extra video token for video processing
-        self.video_token = video_token
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]

-    @property
-    def video_token_id(self) -> int | None:
-        if self.video_token is None:
-            return None
-        return self.tokenizer.get_vocab().get(self.video_token, None)
-
-    @property
-    def supports_video(self) -> bool:
-        return self.video_token_id is not None
+        return get_internvl_target_ratios(min_num, max_num)

-    def _videos_to_pixel_values_lst(
+    def get_num_image_tokens(
        self,
-        videos: list[npt.NDArray],
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=1,
-            max_dynamic_patch=1,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        image_processor = self.image_processor
+        target_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
        )

-        return [
-            video_to_pixel_values_internvl(
-                video,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=False,
-            )
-            for video in videos
-        ]
+        num_patches, _, _ = calculate_internvl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=image_processor.image_size,
+            target_ratios=target_ratios,
+            use_thumbnail=image_processor.use_thumbnail,
+        )

-    def _preprocess_video(
+        return num_patches * self.image_seq_length
+
+    def get_image_repl(
        self,
-        text: list[str],
-        videos: list[npt.NDArray],
-        dynamic_image_size: bool | None = None,
-    ) -> tuple[list[str], dict[str, Any]]:
-        if len(videos) == 0 or not self.supports_video:
-            return text, {}
+        num_patches: int | None,
+        num_features: int | None = None,
+    ) -> PromptUpdateDetails[str]:
+        if num_patches is None:
+            assert num_features is not None
+        else:
+            num_features = num_patches * self.image_seq_length

-        video_token = self.video_token
-        assert video_token is not None
+        repl_features = self.ctx_image_token * num_features
+        repl_full = self.start_image_token + repl_features + self.end_image_token

-        pixel_values_lst_video = self._videos_to_pixel_values_lst(
-            videos,
-            dynamic_image_size=dynamic_image_size,
-        )
-        video_inputs = {
-            "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
-            "video_num_patches": torch.tensor(
-                [len(item) for item in pixel_values_lst_video]
-            ),
-        }
+        return PromptUpdateDetails.select_text(repl_full, self.ctx_image_token)

-        for pixel_values in pixel_values_lst_video:
-            num_patches = pixel_values.shape[0]
+    def get_video_repl(self, num_patches: int) -> PromptUpdateDetails[str]:
+        assert self.ctx_video_token is not None

-            video_repl = self.get_video_repl(
-                self.num_image_token, num_patches, video_token
-            )
-            text = [t.replace("<video>", video_repl.full, 1) for t in text]
-        return text, video_inputs
+        repl_features = self.ctx_video_token * self.image_seq_length
+        repl_features_with_sep = (
+            self.start_image_token + repl_features + self.end_image_token
+        )
+        # num_patches is equal to num_frames
+        repl_full = "".join(
+            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
+        )
+
+        return PromptUpdateDetails.select_text(repl_full, self.ctx_video_token)

    def __call__(
        self,
@@ -550,54 +476,88 @@ class InternVLProcessor(BaseInternVLProcessor):
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
-        text = self._make_batch_input(text)
-        images = self._make_batch_input(images)
-        videos = self._make_batch_input(videos)
+        if images is not None:
+            image_inputs = self.image_processor(
+                images=images,
+                min_dynamic_patch=min_dynamic_patch,
+                max_dynamic_patch=max_dynamic_patch,
+                dynamic_image_size=dynamic_image_size,
+                return_tensors=return_tensors,
+            )
+            image_num_patches = image_inputs["image_num_patches"]
+        else:
+            image_inputs = {}
+            image_num_patches = []

-        text, image_inputs = self._preprocess_image(
-            text=text,
-            images=images,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
+        if videos is not None:
+            if self.video_processor is None:
+                raise ValueError("This model does not support video inputs")

-        text, video_inputs = self._preprocess_video(
-            text=text,
-            videos=videos,
-            dynamic_image_size=dynamic_image_size,
-        )
+            video_inputs = self.video_processor(
+                videos=videos,
+                return_tensors=return_tensors,
+            )
+            video_num_patches = video_inputs["video_num_patches"]
+        else:
+            video_inputs = {}
+            video_num_patches = []

-        text_inputs = self.tokenizer(text)
+        if text is not None:
+            if not isinstance(text, list):
+                text = [text]

-        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
+            if image_inputs:
+                image_token = "<image>"
+                image_index = 0
+                processed_text = list[str]()
+                replace_strings = list[str]()

-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
+                for prompt in text:
+                    new_prompt = prompt

-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
+                    while image_token in new_prompt:
+                        new_prompt = new_prompt.replace(image_token, "<placeholder>", 1)
+                        image_repl = self.get_image_repl(image_num_patches[image_index])
+                        replace_strings.append(image_repl.full)
+                        image_index += 1

-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
+                    while "<placeholder>" in new_prompt:
+                        replace_str = replace_strings.pop(0)
+                        new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)

-    def get_video_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-        video_context_token: str = IMG_CONTEXT,
-    ) -> PromptUpdateDetails[str]:
-        if num_patches is None:
-            raise NotImplementedError("Embedding inputs are not supported")
+                    processed_text.append(new_prompt)

-        repl_features = video_context_token * self.num_image_token
-        repl_features_with_sep = IMG_START + repl_features + IMG_END
-        # num_patches is equal to num_frames
-        repl_full = "".join(
-            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
-        )
+                text = processed_text
+
+            if video_inputs:
+                video_token = "<video>"
+                video_index = 0
+                processed_text = list[str]()
+                replace_strings = list[str]()
+
+                assert video_token is not None
+
+                for prompt in text:
+                    new_prompt = prompt
+
+                    while video_token in new_prompt:
+                        new_prompt = new_prompt.replace(video_token, "<placeholder>", 1)
+                        video_repl = self.get_video_repl(video_num_patches[video_index])
+                        replace_strings.append(video_repl.full)
+                        video_index += 1
+
+                    while "<placeholder>" in new_prompt:
+                        replace_str = replace_strings.pop(0)
+                        new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)

-        return PromptUpdateDetails.select_text(repl_full, video_context_token)
+                    processed_text.append(new_prompt)
+
+                text = processed_text
+
+            text_inputs = self.tokenizer(text, return_tensors=return_tensors)
+        else:
+            text_inputs = {}
+
+        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
+
+        return BatchFeature(combined_outputs, tensor_type=return_tensors)
--- a/vllm/transformers_utils/processors/nano_nemotron_vl.py
+++ b/vllm/transformers_utils/processors/nano_nemotron_vl.py
@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor
 from vllm.multimodal.evs import compute_retained_tokens_count
 from vllm.multimodal.inputs import AudioItem
 from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers.hf import HfTokenizer

 from .internvl import calculate_internvl_targets, get_internvl_target_ratios

@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC):
    def __init__(
        self,
        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
+        tokenizer: HfTokenizer,
        *args,
        max_model_len: int,
        max_num_tiles: int | None = None,
@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
    def __init__(
        self,
        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
+        tokenizer: HfTokenizer,
        *,
        max_model_len: int,
        max_num_tiles: int | None = None,
@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
        tokens_per_frame: list[int],
        frames_indices: list[int],
        frame_duration_ms: int,
-        tokenizer: TokenizerLike,
+        tokenizer: HfTokenizer,
        img_start_token_ids: list[int],
        img_end_token_ids: list[int],
        img_context_token_ids: list[int],
@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
            tokens_per_frame (list[int]): number of tokens per frame
            frames_indices (list[int]): frame indices
            frame_duration_ms (int): duration of each frame in milliseconds
-            tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
+            tokenizer (HfTokenizer): tokenizer to use for tokenizing frame separators
            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens

--- a/vllm/transformers_utils/processors/nemotron_vl.py
+++ b/vllm/transformers_utils/processors/nemotron_vl.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from abc import ABC

 import torch
 import torchvision.transforms as T
 from PIL import Image
-from transformers import PretrainedConfig
-from transformers.image_processing_utils_fast import BaseImageProcessorFast

 from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.processing import PromptUpdateDetails
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers.hf import HfTokenizer

-from .internvl import InternVLProcessor
+from .internvl import InternVLImageProcessor, InternVLProcessor

 # Configure PIL to handle large images without warnings
 # This prevents DecompressionBombWarning for legitimate large images
@@ -172,59 +168,61 @@ def image_to_pixel_values_nemotron_vl(
    return pixel_values


-class NemotronVLProcessor(InternVLProcessor):
-    IMG_START = "<img>"
-    IMG_END = "</img>"
-    IMG_CONTEXT = "<image>"
-
-    def __init__(
+class LlamaNemotronNanoVLImageProcessor(InternVLImageProcessor):
+    def _images_to_pixel_values_lst(
        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        image_processor: BaseImageProcessorFast,
-        *,
+        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
-    ) -> None:
-        ABC.__init__(self)
-        self.config = config
-        self.tokenizer = tokenizer
-        self.image_processor = image_processor
-        image_size: int = config.force_image_size
-        patch_size: int = config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = 1
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = self.image_processor.max_num_tiles
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = True
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+    ) -> list[torch.Tensor]:
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size

-        if image_processor is not None:
-            self.use_thumbnail = image_processor.use_thumbnail
-        else:
-            self.use_thumbnail = getattr(config, "use_thumbnail", True)
+        return [
+            image_to_pixel_values_nemotron_vl(
+                image,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=self.use_thumbnail,
+                transform=build_transform(self.image_size),
+            )
+            for image in images
+        ]
+
+
+class LlamaNemotronNanoVLProcessor(InternVLProcessor):
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.

-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
+    The image processor is given by:
+    https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/image_processing.py
+    """

-    def _get_transform(self) -> T.Compose:
-        return build_transform(input_size=self.image_size)
+    def __init__(
+        self,
+        image_processor: LlamaNemotronNanoVLImageProcessor,
+        tokenizer: HfTokenizer,
+        *,
+        image_seq_length: int,
+        start_image_token: str = "<img>",
+        end_image_token: str = "</img>",
+        ctx_image_token: str = "<image>",
+    ) -> None:
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            image_seq_length=image_seq_length,
+            start_image_token=start_image_token,
+            end_image_token=end_image_token,
+            ctx_image_token=ctx_image_token,
+        )

    def get_num_image_tokens(
        self,
@@ -232,6 +230,7 @@ class NemotronVLProcessor(InternVLProcessor):
        image_width: int,
        image_height: int,
    ) -> int:
+        image_processor = self.image_processor
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
@@ -239,13 +238,33 @@ class NemotronVLProcessor(InternVLProcessor):
        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
-            image_size=self.image_size,
+            image_size=image_processor.image_size,
            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
+            use_thumbnail=image_processor.use_thumbnail,
        )

-        return num_patches * self.num_image_token
+        return num_patches * self.image_seq_length
+

+# SigLIP normalization constants
+SIGLIP_MEAN = (0.5, 0.5, 0.5)
+SIGLIP_STD = (0.5, 0.5, 0.5)
+
+
+def build_siglip_transform(input_size: int):
+    """Build transform for SigLIP vision encoder with normalization.
+
+    Extends the base transform from nemotron_vl with SigLIP-specific normalization.
+    """
+    return T.Compose(
+        [
+            build_transform(input_size=input_size),
+            T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
+        ]
+    )
+
+
+class LlamaNemotronVLEmbedImageProcessor(InternVLImageProcessor):
    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
@@ -267,83 +286,13 @@ class NemotronVLProcessor(InternVLProcessor):
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
-                transform=self._get_transform(),
+                transform=build_siglip_transform(self.image_size),
            )
            for image in images
        ]

-    def _replace_image_tokens(
-        self,
-        text: list[str],
-        pixel_values_lst: list[torch.Tensor],
-    ) -> list[str]:
-        """Replace <image> placeholders with image tokens."""
-        for pixel_values in pixel_values_lst:
-            num_patches = pixel_values.shape[0]
-            feature_size = num_patches * self.num_image_token
-            image_repl = self.get_image_repl(feature_size, num_patches)
-            # Use temporary placeholder to avoid replacing tokens we just inserted
-            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
-            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
-        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
-
-    def _preprocess_image(
-        self,
-        text: list[str],
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> tuple[list[str], dict[str, torch.Tensor]]:
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            text = self._replace_image_tokens(text, pixel_values_lst)
-        return text, image_inputs
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = self.IMG_CONTEXT * feature_size
-        repl_full = self.IMG_START + repl_features + self.IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
-
-
-# SigLIP normalization constants
-SIGLIP_MEAN = (0.5, 0.5, 0.5)
-SIGLIP_STD = (0.5, 0.5, 0.5)

-
-def build_siglip_transform(input_size: int):
-    """Build transform for SigLIP vision encoder with normalization.
-
-    Extends the base transform from nemotron_vl with SigLIP-specific normalization.
-    """
-    return T.Compose(
-        [
-            build_transform(input_size=input_size),
-            T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
-        ]
-    )
-
-
-class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
+class LlamaNemotronVLEmbedProcessor(InternVLProcessor):
    """
    Processor for LlamaNemotronVL embedding model.

@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    - Uses different image context token (<IMG_CONTEXT> vs <image>)
    """

-    IMG_CONTEXT = "<IMG_CONTEXT>"
-
    def __init__(
        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        processor_config: dict,
+        image_processor: LlamaNemotronVLEmbedImageProcessor,
+        tokenizer: HfTokenizer,
        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
+        image_seq_length: int,
+        start_image_token: str = "<img>",
+        end_image_token: str = "</img>",
+        ctx_image_token: str = "<IMG_CONTEXT>",
    ) -> None:
-        if min_dynamic_patch is None:
-            min_dynamic_patch = processor_config.get(
-                "min_input_tiles",
-                getattr(config, "min_dynamic_patch", 1),
-            )
-        if max_dynamic_patch is None:
-            max_dynamic_patch = processor_config.get(
-                "max_input_tiles",
-                getattr(config, "max_dynamic_patch", 1),
-            )
-        if dynamic_image_size is None:
-            dynamic_image_size = processor_config.get(
-                "dynamic_image_size",
-                getattr(config, "dynamic_image_size", True),
-            )
        super().__init__(
-            config=config,
+            image_processor=image_processor,
            tokenizer=tokenizer,
-            image_processor=None,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
+            image_seq_length=image_seq_length,
+            start_image_token=start_image_token,
+            end_image_token=end_image_token,
+            ctx_image_token=ctx_image_token,
        )

-    def _get_transform(self) -> T.Compose:
-        """Override to add SigLIP normalization."""
-        return build_siglip_transform(input_size=self.image_size)
+        self.image_processor: LlamaNemotronVLEmbedImageProcessor

-    def _replace_image_tokens(
+    def get_num_image_tokens(
        self,
-        text: list[str],
-        pixel_values_lst: list[torch.Tensor],
-    ) -> list[str]:
-        """Override with simpler token replacement for embedding model.
-
-        No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
-        not <image>, so there's no collision risk.
-        """
-        for pixel_values in pixel_values_lst:
-            num_patches = pixel_values.shape[0]
-            feature_size = num_patches * self.num_image_token
-            image_repl = self.get_image_repl(feature_size, num_patches)
-            text = [t.replace("<image>", image_repl.full, 1) for t in text]
-        return text
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        image_processor = self.image_processor
+        target_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,  # Applied in calculate_targets
+        )
+
+        num_patches, _, _ = calculate_nemotron_vl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            image_size=image_processor.image_size,
+            target_ratios=target_ratios,
+            use_thumbnail=image_processor.use_thumbnail,
+        )
+
+        return num_patches * self.image_seq_length
--- a/vllm/transformers_utils/processors/nvlm_d.py
+++ b/vllm/transformers_utils/processors/nvlm_d.py
@@ -8,37 +8,54 @@
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
 from vllm.multimodal.processing import PromptUpdateDetails
+from vllm.tokenizers.hf import HfTokenizer

-from .internvl import BaseInternVLProcessor
+from .internvl import InternVLImageProcessor, InternVLProcessor

-IMG_PAD = "<|vision_pad|>"

-
-class NVLMProcessor(BaseInternVLProcessor):
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_PAD]
+class NVLMProcessor(InternVLProcessor):
+    def __init__(
+        self,
+        image_processor: InternVLImageProcessor,
+        tokenizer: HfTokenizer,
+        *,
+        image_seq_length: int,
+        start_image_token: str = "<Image>",
+        end_image_token: str = "</Image>",
+        ctx_image_token: str = "<|vision_pad|>",
+    ) -> None:
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            image_seq_length=image_seq_length,
+            start_image_token=start_image_token,
+            end_image_token=end_image_token,
+            ctx_image_token=ctx_image_token,
+        )

    def get_image_repl(
        self,
-        feature_size: int,
        num_patches: int | None,
+        num_features: int | None = None,
    ) -> PromptUpdateDetails[str]:
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

+        num_features = num_patches * self.image_seq_length
+
        tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
-        if self.use_thumbnail:
+        if self.image_processor.use_thumbnail:
            tile_pos_identifiers += ["<tile_global_thumbnail>"]

-        context_size = feature_size // num_patches
+        context_size = num_features // num_patches
        features = "".join(
-            identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
+            (identifier + self.ctx_image_token * context_size)
+            for identifier in tile_pos_identifiers
        )

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
-        repl = "<Image>" + features + "</Image>"
+        repl = self.start_image_token + features + self.end_image_token

-        return PromptUpdateDetails.select_text(repl, IMG_PAD)
+        return PromptUpdateDetails.select_text(repl, self.ctx_image_token)
--- a/vllm/transformers_utils/processors/skyworkr1v.py
+++ b/vllm/transformers_utils/processors/skyworkr1v.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
-# --------------------------------------------------------
-# SkyworkR1V
-# Copyright (c) 2025 Skywork
-# Licensed under The MIT License [see LICENSE for details]
-# --------------------------------------------------------
-
-import torch
-import torchvision.transforms as T
-from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
-
-from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.processing import PromptUpdateDetails
-from vllm.tokenizers import TokenizerLike
-
-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<IMG_CONTEXT>"
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
-def build_transform(input_size: int):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose(
-        [
-            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
-            T.Resize(
-                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
-            ),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
-        ]
-    )
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
-def find_closest_aspect_ratio(
-    aspect_ratio: float,
-    target_ratios: list[tuple[int, int]],
-    *,
-    width: int,
-    height: int,
-    image_size: int,
-) -> tuple[int, int]:
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def resolve_skyworkr1v_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_skyworkr1v_target_ratios(
-    min_num: int,
-    max_num: int,
-) -> list[tuple[int, int]]:
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if min_num <= i * j <= max_num
-    }
-    return sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-
-def calculate_skyworkr1v_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height
-
-
-def dynamic_preprocess_skyworkr1v(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> list[Image.Image]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_skyworkr1v_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
-def image_to_pixel_values_skyworkr1v(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess_skyworkr1v(
-        image,
-        target_ratios=target_ratios,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values
-
-
-class SkyworkR1VProcessor:
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        image_size: int = config.vision_config.image_size
-        patch_size: int = config.vision_config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = config.min_dynamic_patch
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = config.max_dynamic_patch
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = config.dynamic_image_size
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_skyworkr1v_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-        return get_skyworkr1v_target_ratios(min_num, max_num)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_skyworkr1v_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_skyworkr1v(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-            )
-            for image in images
-        ]
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-
-                image_repl = self.get_image_repl(feature_size, num_patches)
-
-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
-
-        text_inputs = self.tokenizer(text)
-
-        combined_outputs = {**text_inputs, **image_inputs}
-
-        return BatchFeature(combined_outputs, tensor_type=return_tensors)