# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- # InternVL # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from typing import Annotated, Literal, TypeAlias, TypeVar import torch import torch.nn as nn from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import ( InternVisionModel, InternVisionPatchModel, ) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, ) from vllm.multimodal.parse import ( ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems, ) from vllm.multimodal.processing import ( BaseDummyInputsBuilder, BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, ) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processors.internvl import ( InternVLImageProcessor, InternVLProcessor, InternVLVideoProcessor, ) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP, ) from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix class InternVLImagePixelInputs(TensorSchema): """ Dimensions: - bn: Batch size * number of images - bnp: Batch size * number of images * (1 + num_patches) - c: Number of channels (3) - h: Height of each image patch - w: Width of each image patch """ type: Literal["pixel_values"] pixel_values_flat: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")] num_patches: Annotated[torch.Tensor, TensorShape("bn")] class InternVLImageEmbeddingInputs(TensorSchema): """ Dimensions: - n: Number of images - f: Total image feature size - h: Hidden size (must match the hidden size of language model backbone) """ type: Literal["image_embeds"] data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")] InternVLImageInputs: TypeAlias = InternVLImagePixelInputs | InternVLImageEmbeddingInputs class InternVLVideoPixelInputs(TensorSchema): """ Dimensions: - bvf: Batch size * number of videos * num_frames - bn: Batch size * number of images - c: Number of channels (3) - h: Height of each video frame - w: Width of each video frame """ type: Literal["pixel_values_videos"] pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")] num_patches: Annotated[torch.Tensor, TensorShape("bn")] class InternVLVideoEmbeddingInputs(TensorSchema): """ Dimensions: - n: Number of videos - f: Total video feature size - h: Hidden size (must match the hidden size of language model backbone) """ type: Literal["video_embeds"] data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")] InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} def get_num_image_tokens( self, *, image_width: int, image_height: int, processor: InternVLProcessor, ) -> int: return processor.get_num_image_tokens( image_width=image_width, image_height=image_height, ) def get_image_size_with_most_features(self) -> ImageSize: processor = self.get_hf_processor() image_processor = processor.image_processor base_size = image_processor.image_size target_ratios = processor.resolve_target_ratios() largest_feature_size, largest_feature_pinpoint = 0, None for wr, hr in target_ratios: width, height = base_size * wr, base_size * hr feat_size = self.get_num_image_tokens( image_width=width, image_height=height, processor=processor, ) if feat_size > largest_feature_size: largest_feature_size = feat_size largest_feature_pinpoint = ImageSize(width=width, height=height) if largest_feature_size == 0 or largest_feature_pinpoint is None: raise ValueError("Cannot have a largest feature size of 0!") return largest_feature_pinpoint def get_max_image_tokens(self) -> int: processor = self.get_hf_processor() target_width, target_height = self.get_image_size_with_most_features() return self.get_num_image_tokens( image_width=target_width, image_height=target_height, processor=processor, ) _I = TypeVar("_I", bound=BaseInternVLProcessingInfo) class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): """Basic image-only DummyInputsBuilder for InternVL-style models.""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) return "" * num_images def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, overrides=image_overrides, ) } class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): """Basic image-only MultiModalProcessor for InternVL-style models.""" def _call_hf_processor( self, prompt: str, mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], tok_kwargs: Mapping[str, object], ) -> BatchFeature: processed_outputs = super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, tok_kwargs=tok_kwargs, ) hf_processor = self.info.get_hf_processor(**mm_kwargs) image_token_id = hf_processor.ctx_image_token_id # Since there may be extra tokens in the feature placeholders, # we need to pass the image token ID to the model to select the # tokens to merge from the vision encoder outputs processed_outputs["image_token_id"] = torch.tensor(image_token_id) return processed_outputs def _get_mm_fields_config( self, hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) num_images = len(image_num_patches) return dict( pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( "image", image_num_patches ), image_num_patches=MultiModalFieldConfig.batched("image"), image_embeds=MultiModalFieldConfig.batched("image"), image_token_id=MultiModalFieldConfig.shared("image", num_images), ) def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) out_mm_data = out_mm_kwargs.get_data() if "image_num_patches" in out_mm_data: image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] def get_replacement_internvl(item_idx: int): images = mm_items.get_items( "image", (ImageEmbeddingItems, ImageProcessorItems) ) if isinstance(images, ImageEmbeddingItems): feature_size = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) feature_size = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, processor=hf_processor, ) num_patches = image_num_patches[item_idx] if num_patches is not None: assert isinstance(num_patches, int) return hf_processor.get_image_repl(num_patches, num_features=feature_size) return [ PromptReplacement( modality="image", target="", replacement=get_replacement_internvl, ) ] class InternVLProcessingInfo(BaseInternVLProcessingInfo): """InternVL ProcessingInfo extended for video processing""" def get_image_processor(self, **kwargs): config = self.get_hf_config() vision_config = config.vision_config kwargs = self.ctx.get_merged_mm_kwargs(kwargs) kwargs.setdefault("image_size", vision_config.image_size) kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) kwargs.setdefault("use_thumbnail", config.use_thumbnail) return InternVLImageProcessor(**kwargs) def get_video_processor(self, **kwargs): config = self.get_hf_config() vision_config = config.vision_config kwargs = self.ctx.get_merged_mm_kwargs(kwargs) kwargs.setdefault("image_size", vision_config.image_size) return InternVLVideoProcessor(**kwargs) @cached_property def ctx_video_token(self): text_model_type = self.get_hf_config().get_text_config().model_type ctx_video_token_map = { "qwen2": "<|video_pad|>", "qwen3": "<|video_pad|>", "qwen3_moe": "<|video_pad|>", "gpt_oss": "<|reserved_200000|>", } if text_model_type not in ctx_video_token_map: return None ctx_video_token = ctx_video_token_map[text_model_type] if ctx_video_token not in self.get_tokenizer().get_vocab(): return None return ctx_video_token def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: config = self.get_hf_config() vision_config = config.vision_config image_processor = self.get_image_processor(**kwargs) image_size = image_processor.image_size patch_size = vision_config.patch_size downsample_ratio = config.downsample_ratio image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) ctx_video_token = self.ctx_video_token video_processor = ( self.get_video_processor(**kwargs) if ctx_video_token else None ) return InternVLProcessor( tokenizer=self.get_tokenizer(), image_processor=image_processor, video_processor=video_processor, image_seq_length=image_seq_length, ctx_video_token=ctx_video_token, ) def get_supported_mm_limits(self): video_limit = {"video": None} if self.ctx_video_token else {} return {**super().get_supported_mm_limits(), **video_limit} def get_num_frames_with_most_features( self, seq_len: int, mm_counts: Mapping[str, int], ) -> int: max_images = mm_counts.get("image", 0) max_videos = mm_counts.get("video", 0) processor = self.get_hf_processor() num_image_token = processor.image_seq_length max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = (seq_len - max_image_tokens) // num_image_token max_frames_per_video = max_total_frames // max(max_videos, 1) return max(max_frames_per_video, 1) class InternVLDummyInputsBuilder( BaseInternVLDummyInputsBuilder[InternVLProcessingInfo] ): """InternVL DummyInputsBuilder extended for video support""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_videos = mm_counts.get("video", 0) return super().get_dummy_text(mm_counts) + "