Unverified commit 99267c23, authored by Cyrus Leung and committed by GitHub
Browse files

[2/3] Refactor InternVL-based processors (#37324)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 525f2eeb
...@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs): def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.h2ovl import ( from vllm.transformers_utils.processors.h2ovl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_h2ovl, image_to_pixel_values_h2ovl,
) )
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values_h2ovl( image_to_pixel_values_h2ovl(
...@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs): def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.skyworkr1v import ( from vllm.transformers_utils.processors.internvl import (
IMG_CONTEXT, image_to_pixel_values_internvl,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
) )
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values_skyworkr1v( image_to_pixel_values_internvl(
image, image,
input_size=self.image_size, input_size=self.image_size,
min_num=self.min_num, min_num=self.min_num,
...@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos: npt.NDArray | list[npt.NDArray] = None, videos: npt.NDArray | list[npt.NDArray] = None,
**kwargs, **kwargs,
): ):
from vllm.model_executor.models.internvl import ( from vllm.transformers_utils.processors.internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_internvl, image_to_pixel_values_internvl,
video_to_pixel_values_internvl, video_to_pixel_values_internvl,
) )
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None: if images is not None:
......
...@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"rednote-hilab/dots.ocr", trust_remote_code=True "rednote-hilab/dots.ocr", trust_remote_code=True
), ),
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo( "Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
"nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False "nvidia/Eagle2.5-8B",
trust_remote_code=True,
), ),
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo( "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
......
...@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import ( from .interfaces import (
...@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = ( ...@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = (
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo): class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model.""" """Processing info for Eagle2.5-VL model."""
def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor: def get_image_processor(self, **kwargs):
return self.ctx.init_processor( config = self.get_hf_config()
Eagle2_5_VLProcessor, vision_config = config.vision_config
config=self.ctx.get_hf_config(),
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault(
"image_size", config.force_image_size or vision_config.image_size
)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
return InternVLImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return InternVLProcessor(
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
**kwargs, image_processor=image_processor,
image_seq_length=image_seq_length,
) )
......
...@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo): ...@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
vision_config = config.vision_config vision_config = config.vision_config
image_size = vision_config["image_size"] image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size}) kwargs.setdefault("size", {"width": image_size, "height": image_size})
return GLM4VImageProcessorFast(**kwargs) return GLM4VImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor: def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
return self.ctx.init_processor( return GLM4VProcessor(
GLM4VProcessor,
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs), image_processor=self.get_image_processor(**kwargs),
) )
......
...@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import ( ...@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import (
PromptUpdate, PromptUpdate,
TimingContext, TimingContext,
) )
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor
from .intern_vit import InternVisionModel from .intern_vit import InternVisionModel
from .internvl import ( from .internvl import (
...@@ -40,12 +40,34 @@ from .internvl import ( ...@@ -40,12 +40,34 @@ from .internvl import (
class H2OVLProcessingInfo(BaseInternVLProcessingInfo): class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
kwargs.setdefault("use_msac", config.use_msac)
return H2OVLImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
return self.ctx.init_processor( config = self.get_hf_config()
H2OVLProcessor, vision_config = config.vision_config
config=self.get_hf_config(),
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return H2OVLProcessor(
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
**kwargs, image_processor=image_processor,
image_seq_length=image_seq_length,
) )
def get_num_image_tokens( def get_num_image_tokens(
...@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn ...@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
if num_patches is not None: if num_patches is not None:
assert isinstance(num_patches, int) assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches) return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [ return [
PromptReplacement( PromptReplacement(
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
# -------------------------------------------------------- # --------------------------------------------------------
from abc import abstractmethod from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from typing import Annotated, Literal, TypeAlias, TypeVar from typing import Annotated, Literal, TypeAlias, TypeVar
import torch import torch
...@@ -45,8 +46,9 @@ from vllm.multimodal.processing import ( ...@@ -45,8 +46,9 @@ from vllm.multimodal.processing import (
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.internvl import ( from vllm.transformers_utils.processors.internvl import (
BaseInternVLProcessor, InternVLImageProcessor,
InternVLProcessor, InternVLProcessor,
InternVLVideoProcessor,
) )
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
...@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): ...@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models.""" """Basic image-only ProcessingInfo for InternVL-style models."""
@abstractmethod @abstractmethod
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
raise NotImplementedError raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
...@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): ...@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
*, *,
image_width: int, image_width: int,
image_height: int, image_height: int,
processor: BaseInternVLProcessor, processor: InternVLProcessor,
) -> int: ) -> int:
return processor.get_num_image_tokens( return processor.get_num_image_tokens(
image_width=image_width, image_width=image_width,
...@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): ...@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor() processor = self.get_hf_processor()
image_processor = processor.image_processor
base_size = processor.image_size base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios() target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None largest_feature_size, largest_feature_pinpoint = 0, None
...@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
) )
hf_processor = self.info.get_hf_processor(**mm_kwargs) hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders, # Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the # we need to pass the image token ID to the model to select the
...@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
if num_patches is not None: if num_patches is not None:
assert isinstance(num_patches, int) assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches) return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [ return [
PromptReplacement( PromptReplacement(
...@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
class InternVLProcessingInfo(BaseInternVLProcessingInfo): class InternVLProcessingInfo(BaseInternVLProcessingInfo):
"""InternVL ProcessingInfo extended for video processing""" """InternVL ProcessingInfo extended for video processing"""
@property def get_image_processor(self, **kwargs):
def supports_video(self): config = self.get_hf_config()
return self.get_hf_processor().supports_video vision_config = config.vision_config
def get_supported_mm_limits(self): kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
video_limit = {"video": None} if self.supports_video else {} kwargs.setdefault("image_size", vision_config.image_size)
return {**super().get_supported_mm_limits(), **video_limit} kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
return InternVLImageProcessor(**kwargs)
def get_video_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
def get_video_token(self) -> str | None: kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
return InternVLVideoProcessor(**kwargs)
@cached_property
def ctx_video_token(self):
text_model_type = self.get_hf_config().get_text_config().model_type text_model_type = self.get_hf_config().get_text_config().model_type
video_token_map = { ctx_video_token_map = {
"qwen2": "<|video_pad|>", "qwen2": "<|video_pad|>",
"qwen3": "<|video_pad|>", "qwen3": "<|video_pad|>",
"qwen3_moe": "<|video_pad|>", "qwen3_moe": "<|video_pad|>",
"gpt_oss": "<|reserved_200000|>", "gpt_oss": "<|reserved_200000|>",
} }
return video_token_map.get(text_model_type)
if text_model_type not in ctx_video_token_map:
return None
ctx_video_token = ctx_video_token_map[text_model_type]
if ctx_video_token not in self.get_tokenizer().get_vocab():
return None
return ctx_video_token
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
ctx_video_token = self.ctx_video_token
video_processor = (
self.get_video_processor(**kwargs) if ctx_video_token else None
)
return InternVLProcessor(
tokenizer=self.get_tokenizer(),
image_processor=image_processor,
video_processor=video_processor,
image_seq_length=image_seq_length,
ctx_video_token=ctx_video_token,
)
def get_supported_mm_limits(self):
video_limit = {"video": None} if self.ctx_video_token else {}
return {**super().get_supported_mm_limits(), **video_limit}
def get_num_frames_with_most_features( def get_num_frames_with_most_features(
self, self,
...@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): ...@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
max_videos = mm_counts.get("video", 0) max_videos = mm_counts.get("video", 0)
processor = self.get_hf_processor() processor = self.get_hf_processor()
num_image_token = processor.image_seq_length
max_image_tokens = self.get_max_image_tokens() * max_images max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token max_total_frames = (seq_len - max_image_tokens) // num_image_token
max_frames_per_video = max_total_frames // max(max_videos, 1) max_frames_per_video = max_total_frames // max(max_videos, 1)
return max(max_frames_per_video, 1) return max(max_frames_per_video, 1)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
video_token=self.get_video_token(),
**kwargs,
)
class InternVLDummyInputsBuilder( class InternVLDummyInputsBuilder(
BaseInternVLDummyInputsBuilder[InternVLProcessingInfo] BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
...@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder( ...@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder(
mm_options: Mapping[str, BaseDummyOptions], mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict: ) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options) dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
if self.info.supports_video: if self.info.ctx_video_token:
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size: int = config.vision_config.image_size image_size: int = config.vision_config.image_size
target_num_frames = self.info.get_num_frames_with_most_features( target_num_frames = self.info.get_num_frames_with_most_features(
...@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor( ...@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor(
) )
hf_processor = self.info.get_hf_processor(**mm_kwargs) hf_processor = self.info.get_hf_processor(**mm_kwargs)
if ( if (video_token_id := hf_processor.ctx_video_token_id) is not None:
self.info.supports_video
and (video_token_id := hf_processor.video_token_id) is not None
):
processed_outputs["video_token_id"] = torch.tensor(video_token_id) processed_outputs["video_token_id"] = torch.tensor(video_token_id)
return processed_outputs return processed_outputs
def _get_mm_fields_config( def _get_mm_fields_config(
...@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor( ...@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]: ) -> Mapping[str, MultiModalFieldConfig]:
image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs) image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
if self.info.supports_video: if self.info.ctx_video_token:
video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0)) video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
num_videos = len(video_num_patches) num_videos = len(video_num_patches)
video_fields = dict( video_fields = dict(
...@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor( ...@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
out_mm_kwargs=out_mm_kwargs, out_mm_kwargs=out_mm_kwargs,
) )
if self.info.ctx_video_token is None:
return prompt_repl
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
...@@ -456,17 +501,13 @@ class InternVLMultiModalProcessor( ...@@ -456,17 +501,13 @@ class InternVLMultiModalProcessor(
video_num_patches = [] video_num_patches = []
def get_video_replacement_internvl(item_idx: int): def get_video_replacement_internvl(item_idx: int):
feature_size = hf_processor.num_image_token
num_patches = video_num_patches[item_idx] num_patches = video_num_patches[item_idx]
if num_patches is not None: if num_patches is not None:
assert isinstance(num_patches, int) assert isinstance(num_patches, int)
return hf_processor.get_video_repl( return hf_processor.get_video_repl(num_patches)
feature_size, num_patches, video_context_token=hf_processor.video_token
)
if self.info.supports_video: return [
prompt_repl = [
*prompt_repl, *prompt_repl,
PromptReplacement( PromptReplacement(
modality="video", modality="video",
...@@ -475,8 +516,6 @@ class InternVLMultiModalProcessor( ...@@ -475,8 +516,6 @@ class InternVLMultiModalProcessor(
), ),
] ]
return prompt_repl
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
InternVLMultiModalProcessor, InternVLMultiModalProcessor,
......
...@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY ...@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processor import cached_image_processor_from_config from vllm.transformers_utils.processor import cached_image_processor_from_config
from vllm.transformers_utils.processors.nemotron_vl import ( from vllm.transformers_utils.processors.nemotron_vl import (
LlamaNemotronNanoVLImageProcessor,
LlamaNemotronNanoVLProcessor,
LlamaNemotronVLEmbedImageProcessor,
LlamaNemotronVLEmbedProcessor, LlamaNemotronVLEmbedProcessor,
NemotronVLProcessor,
) )
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
...@@ -50,19 +52,34 @@ from .utils import ( ...@@ -50,19 +52,34 @@ from .utils import (
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Nemotron VL models.""" """Processing info for Nemotron VL models."""
def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: def get_image_processor(self, **kwargs: object):
return self.ctx.init_processor( kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
NemotronVLProcessor, orig_processor = cached_image_processor_from_config(
config=self.get_hf_config(), self.ctx.model_config, **kwargs
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(),
**kwargs,
) )
def get_image_processor(self, **kwargs: object): return LlamaNemotronNanoVLImageProcessor(
return cached_image_processor_from_config( image_size=orig_processor.image_size,
self.ctx.model_config, min_dynamic_patch=1,
**kwargs, max_dynamic_patch=orig_processor.max_num_tiles,
dynamic_image_size=True,
use_thumbnail=orig_processor.use_thumbnail,
)
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronNanoVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return LlamaNemotronNanoVLProcessor(
tokenizer=self.get_tokenizer(),
image_processor=image_processor,
image_seq_length=image_seq_length,
) )
...@@ -386,14 +403,13 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor ...@@ -386,14 +403,13 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# -------------------------------------------------------- # --------------------------------------------------------
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo): class LlamaNemotronVLEmbedProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for LlamaNemotronVL embedding model.""" """Processing info for LlamaNemotronVL embedding model."""
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor: def get_image_processor(self, **kwargs):
"""Override to create embedding-specific processor without image_processor."""
model_config = self.ctx.model_config model_config = self.ctx.model_config
processor_config = {}
if model_config.model is not None: config = self.get_hf_config()
processor_config = ( processor_config = (
get_hf_file_to_dict( get_hf_file_to_dict(
"processor_config.json", "processor_config.json",
...@@ -403,12 +419,42 @@ class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo): ...@@ -403,12 +419,42 @@ class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
or {} or {}
) )
return self.ctx.init_processor( min_dynamic_patch = processor_config.get(
LlamaNemotronVLEmbedProcessor, "min_input_tiles",
config=self.get_hf_config(), getattr(config, "min_dynamic_patch", 1),
)
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", config.force_image_size)
kwargs.setdefault("min_dynamic_patch", min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", dynamic_image_size)
kwargs.setdefault("use_thumbnail", True)
return LlamaNemotronVLEmbedImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return LlamaNemotronVLEmbedProcessor(
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
processor_config=processor_config, image_processor=image_processor,
**kwargs, image_seq_length=image_seq_length,
) )
......
...@@ -27,7 +27,8 @@ from vllm.multimodal.processing import ( ...@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
PromptUpdateDetails, PromptUpdateDetails,
) )
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor
from .intern_vit import InternVisionModel from .intern_vit import InternVisionModel
from .internvl import ( from .internvl import (
...@@ -39,12 +40,33 @@ from .internvl import ( ...@@ -39,12 +40,33 @@ from .internvl import (
class NVLMProcessingInfo(BaseInternVLProcessingInfo): class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
return InternVLImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
return self.ctx.init_processor( config = self.get_hf_config()
NVLMProcessor, vision_config = config.vision_config
config=self.get_hf_config(),
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return NVLMProcessor(
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
**kwargs, image_processor=image_processor,
image_seq_length=image_seq_length,
) )
...@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo ...@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
if num_patches is not None: if num_patches is not None:
assert isinstance(num_patches, int) assert isinstance(num_patches, int)
repl = hf_processor.get_image_repl(feature_size, num_patches) repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)
return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD) return PromptUpdateDetails.select_text(
repl.full + "\n", hf_processor.ctx_image_token
)
# See note in dummy data regarding why we have the extra newline # See note in dummy data regarding why we have the extra newline
return [ return [
......
...@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo): ...@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
vision_config = config.visual vision_config = config.visual
image_size = vision_config["image_size"] image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size}) kwargs.setdefault("size", {"width": image_size, "height": image_size})
return QwenVLImageProcessorFast(**kwargs) return QwenVLImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor: def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor( return QwenVLProcessor(
QwenVLProcessor,
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs), image_processor=self.get_image_processor(**kwargs),
) )
......
...@@ -43,7 +43,10 @@ from vllm.multimodal.processing import ( ...@@ -43,7 +43,10 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
...@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = ( ...@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = (
class SkyworkR1VProcessingInfo(BaseProcessingInfo): class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: def get_image_processor(self, **kwargs):
return self.ctx.init_processor( config = self.get_hf_config()
SkyworkR1VProcessor, vision_config = config.vision_config
config=self.get_hf_config(),
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
return InternVLImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return InternVLProcessor(
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
**kwargs, image_processor=image_processor,
image_seq_length=image_seq_length,
) )
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
...@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo): ...@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
*, *,
image_width: int, image_width: int,
image_height: int, image_height: int,
processor: SkyworkR1VProcessor, processor: InternVLProcessor,
) -> int: ) -> int:
return processor.get_num_image_tokens( return processor.get_num_image_tokens(
image_width=image_width, image_width=image_width,
...@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo): ...@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor() processor = self.get_hf_processor()
image_processor = processor.image_processor
base_size = processor.image_size base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios() target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None largest_feature_size, largest_feature_pinpoint = 0, None
...@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing ...@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
) )
hf_processor = self.info.get_hf_processor(**mm_kwargs) hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders, # Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the # we need to pass the image token ID to the model to select the
...@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing ...@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
if num_patches is not None: if num_patches is not None:
assert isinstance(num_patches, int) assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches) return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [ return [
PromptReplacement( PromptReplacement(
......
...@@ -14,7 +14,6 @@ __all__ = [ ...@@ -14,7 +14,6 @@ __all__ = [
"BagelProcessor", "BagelProcessor",
"CohereASRProcessor", "CohereASRProcessor",
"DeepseekVLV2Processor", "DeepseekVLV2Processor",
"Eagle2_5_VLProcessor",
"FireRedASR2Processor", "FireRedASR2Processor",
"FunASRProcessor", "FunASRProcessor",
"GLM4VProcessor", "GLM4VProcessor",
...@@ -34,14 +33,12 @@ __all__ = [ ...@@ -34,14 +33,12 @@ __all__ = [
"Ovis2_5Processor", "Ovis2_5Processor",
"QwenVLProcessor", "QwenVLProcessor",
"Qwen3ASRProcessor", "Qwen3ASRProcessor",
"SkyworkR1VProcessor",
] ]
_CLASS_TO_MODULE: dict[str, str] = { _CLASS_TO_MODULE: dict[str, str] = {
"BagelProcessor": "vllm.transformers_utils.processors.bagel", "BagelProcessor": "vllm.transformers_utils.processors.bagel",
"CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr", "CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2", "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
"Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2", "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
"FunASRProcessor": "vllm.transformers_utils.processors.funasr", "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v", "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
...@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5", "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl", "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr", "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
"SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
} }
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Subclasses BaseInternVLProcessor but assigns every attribute itself
    rather than delegating to the parent constructor.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # The parent constructor is deliberately not called (the original
        # author wanted to avoid its config manipulation); every attribute
        # is assigned directly here instead.
        self.config = config
        self.tokenizer = tokenizer

        # A truthy ``force_image_size`` on the config overrides the vision
        # config's own image size.
        image_size: int = config.vision_config.image_size
        forced_image_size = getattr(config, "force_image_size", None)
        if forced_image_size:
            image_size = forced_image_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Number of image tokens: squared patches-per-side scaled by the
        # squared downsample ratio.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # Explicit keyword arguments win; otherwise fall back to the config
        # value, and finally to a hard-coded default.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Return the image token ID, preferring the config over the tokenizer.

        Raises:
            ValueError: If the config has no ``image_token_index`` and the
                tokenizer vocabulary does not contain ``IMG_CONTEXT``.
        """
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index

        # Fall back to looking up the IMG_CONTEXT token in the vocabulary.
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for one image.

        The replacement is ``IMG_CONTEXT`` repeated ``feature_size`` times,
        wrapped in ``IMG_START`` / ``IMG_END``. ``num_patches`` is accepted
        but not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        wrapped = IMG_START + context_run + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)
...@@ -10,16 +10,12 @@ ...@@ -10,16 +10,12 @@
# -------------------------------------------------------- # --------------------------------------------------------
import torch import torch
from PIL import Image from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails from vllm.tokenizers.hf import HfTokenizer
from vllm.tokenizers import TokenizerLike
from .internvl import ( from .internvl import (
IMG_CONTEXT, InternVLImageProcessor,
IMG_END, InternVLProcessor,
IMG_START,
BaseInternVLProcessor,
build_transform, build_transform,
find_closest_aspect_ratio, find_closest_aspect_ratio,
get_internvl_target_ratios, get_internvl_target_ratios,
...@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl( ...@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl(
return pixel_values return pixel_values
class H2OVLProcessor(BaseInternVLProcessor): class H2OVLImageProcessor(InternVLImageProcessor):
def __init__( def __init__(
self, self,
config: PretrainedConfig, image_size: int,
tokenizer: TokenizerLike, min_dynamic_patch: int,
*, max_dynamic_patch: int,
min_dynamic_patch: int | None = None, dynamic_image_size: bool,
max_dynamic_patch: int | None = None, use_thumbnail: bool,
dynamic_image_size: bool | None = None, use_msac: bool,
use_msac: bool | None = None,
) -> None: ) -> None:
super().__init__( super().__init__(
config, image_size=image_size,
tokenizer,
min_dynamic_patch=min_dynamic_patch, min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
) )
if use_msac is None:
use_msac = config.use_msac
assert isinstance(use_msac, bool)
self.use_msac = use_msac self.use_msac = use_msac
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num( def resolve_min_max_num(
self, self,
*, *,
...@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
dynamic_image_size: bool | None = None, dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None, use_thumbnail: bool | None = None,
) -> tuple[int, int]: ) -> tuple[int, int]:
min_dynamic_patch = ( if min_dynamic_patch is None:
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch min_dynamic_patch = self.min_dynamic_patch
) if max_dynamic_patch is None:
max_dynamic_patch = ( max_dynamic_patch = self.max_dynamic_patch
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch if dynamic_image_size is None:
) dynamic_image_size = self.dynamic_image_size
dynamic_image_size = ( if use_thumbnail is None:
self.dynamic_image_size use_thumbnail = self.use_thumbnail
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_h2ovl_min_max_num( return resolve_h2ovl_min_max_num(
min_dynamic_patch=min_dynamic_patch, min_dynamic_patch=min_dynamic_patch,
...@@ -284,6 +257,57 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -284,6 +257,57 @@ class H2OVLProcessor(BaseInternVLProcessor):
use_thumbnail=use_thumbnail, use_thumbnail=use_thumbnail,
) )
def _images_to_pixel_values_lst(
    self,
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
    """Convert each image into its pixel-value tensor.

    MSAC is only applied when exactly one image is given; for any other
    number of images it is switched off regardless of ``self.use_msac``.
    """
    msac_enabled = len(images) == 1 and self.use_msac

    # The thumbnail is excluded here because image_to_pixel_values_h2ovl
    # receives ``use_thumbnail`` itself and applies it there.
    min_num, max_num = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=False,
    )

    pixel_values_lst = []
    for image in images:
        pixel_values = image_to_pixel_values_h2ovl(
            image,
            input_size=self.image_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=self.use_thumbnail,
            use_msac=msac_enabled,
        )
        pixel_values_lst.append(pixel_values)
    return pixel_values_lst
class H2OVLProcessor(InternVLProcessor):
def __init__(
    self,
    image_processor: H2OVLImageProcessor,
    tokenizer: HfTokenizer,
    *,
    image_seq_length: int,
    start_image_token: str = "<img>",
    end_image_token: str = "</img>",
    ctx_image_token: str = "<IMG_CONTEXT>",
) -> None:
    """Initialize the processor by forwarding all arguments to the parent.

    Every parameter is passed unchanged to ``super().__init__``; this
    override only supplies the default token strings and narrows the
    declared type of ``image_processor``.
    """
    super().__init__(
        image_processor=image_processor,
        tokenizer=tokenizer,
        image_seq_length=image_seq_length,
        start_image_token=start_image_token,
        end_image_token=end_image_token,
        ctx_image_token=ctx_image_token,
    )

    # Annotation-only statement: assigns nothing at runtime, it just
    # declares the attribute's type as H2OVLImageProcessor for type checkers.
    self.image_processor: H2OVLImageProcessor
def resolve_target_ratios( def resolve_target_ratios(
self, self,
*, *,
...@@ -294,7 +318,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -294,7 +318,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
prior_aspect_ratio: tuple[int, int] | None = None, prior_aspect_ratio: tuple[int, int] | None = None,
override_min_num: int | None = None, override_min_num: int | None = None,
) -> list[tuple[int, int]]: ) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num( min_num, max_num = self.image_processor.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch, min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
...@@ -316,9 +340,10 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -316,9 +340,10 @@ class H2OVLProcessor(BaseInternVLProcessor):
image_height: int, image_height: int,
use_msac: bool | None = None, use_msac: bool | None = None,
) -> int: ) -> int:
use_msac = self.use_msac if use_msac is None else use_msac image_processor = self.image_processor
use_msac = image_processor.use_msac if use_msac is None else use_msac
use_thumbnail = self.use_thumbnail use_thumbnail = image_processor.use_thumbnail
if use_msac: if use_msac:
target_ratios_1 = self.resolve_target_ratios( target_ratios_1 = self.resolve_target_ratios(
...@@ -328,7 +353,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -328,7 +353,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets( num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
orig_width=image_width, orig_width=image_width,
orig_height=image_height, orig_height=image_height,
image_size=self.image_size, image_size=image_processor.image_size,
target_ratios=target_ratios_1, target_ratios=target_ratios_1,
use_thumbnail=True, use_thumbnail=True,
) )
...@@ -341,7 +366,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -341,7 +366,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches_2, _, _, _ = calculate_h2ovl_targets( num_patches_2, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width, orig_width=image_width,
orig_height=image_height, orig_height=image_height,
image_size=self.image_size, image_size=image_processor.image_size,
target_ratios=target_ratios_2, target_ratios=target_ratios_2,
use_thumbnail=True, use_thumbnail=True,
) )
...@@ -354,37 +379,9 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -354,37 +379,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches, _, _, _ = calculate_h2ovl_targets( num_patches, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width, orig_width=image_width,
orig_height=image_height, orig_height=image_height,
image_size=self.image_size, image_size=image_processor.image_size,
target_ratios=target_ratios, target_ratios=target_ratios,
use_thumbnail=use_thumbnail, use_thumbnail=use_thumbnail,
) )
return num_patches * self.num_image_token return num_patches * self.image_seq_length
def _images_to_pixel_values_lst(
    self,
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
    """Produce one pixel-value tensor per input image.

    MSAC follows ``self.use_msac`` only for a single image and is disabled
    for every other image count.
    """
    if len(images) == 1:
        use_msac = self.use_msac
    else:
        use_msac = False

    # use_thumbnail=False: the thumbnail is handled inside
    # image_to_pixel_values_h2ovl, which gets ``self.use_thumbnail`` below.
    patch_bounds = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=False,
    )
    min_num, max_num = patch_bounds

    results = []
    for img in images:
        results.append(
            image_to_pixel_values_h2ovl(
                img,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                use_msac=use_msac,
            )
        )
    return results
...@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor ...@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor
from vllm.multimodal.evs import compute_retained_tokens_count from vllm.multimodal.evs import compute_retained_tokens_count
from vllm.multimodal.inputs import AudioItem from vllm.multimodal.inputs import AudioItem
from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
from vllm.tokenizers import TokenizerLike from vllm.tokenizers.hf import HfTokenizer
from .internvl import calculate_internvl_targets, get_internvl_target_ratios from .internvl import calculate_internvl_targets, get_internvl_target_ratios
...@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC): ...@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC):
def __init__( def __init__(
self, self,
config: PretrainedConfig, config: PretrainedConfig,
tokenizer: TokenizerLike, tokenizer: HfTokenizer,
*args, *args,
max_model_len: int, max_model_len: int,
max_num_tiles: int | None = None, max_num_tiles: int | None = None,
...@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ...@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
def __init__( def __init__(
self, self,
config: PretrainedConfig, config: PretrainedConfig,
tokenizer: TokenizerLike, tokenizer: HfTokenizer,
*, *,
max_model_len: int, max_model_len: int,
max_num_tiles: int | None = None, max_num_tiles: int | None = None,
...@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ...@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame: list[int], tokens_per_frame: list[int],
frames_indices: list[int], frames_indices: list[int],
frame_duration_ms: int, frame_duration_ms: int,
tokenizer: TokenizerLike, tokenizer: HfTokenizer,
img_start_token_ids: list[int], img_start_token_ids: list[int],
img_end_token_ids: list[int], img_end_token_ids: list[int],
img_context_token_ids: list[int], img_context_token_ids: list[int],
...@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ...@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame (list[int]): number of tokens per frame tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds frame_duration_ms (int): duration of each frame in milliseconds
tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators tokenizer (HfTokenizer): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC
import torch import torch
import torchvision.transforms as T import torchvision.transforms as T
from PIL import Image from PIL import Image
from transformers import PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.multimodal.image import convert_image_mode from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails from vllm.tokenizers.hf import HfTokenizer
from vllm.tokenizers import TokenizerLike
from .internvl import InternVLProcessor from .internvl import InternVLImageProcessor, InternVLProcessor
# Configure PIL to handle large images without warnings # Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images # This prevents DecompressionBombWarning for legitimate large images
...@@ -172,59 +168,61 @@ def image_to_pixel_values_nemotron_vl( ...@@ -172,59 +168,61 @@ def image_to_pixel_values_nemotron_vl(
return pixel_values return pixel_values
class NemotronVLProcessor(InternVLProcessor): class LlamaNemotronNanoVLImageProcessor(InternVLImageProcessor):
IMG_START = "<img>" def _images_to_pixel_values_lst(
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
def __init__(
self, self,
config: PretrainedConfig, images: list[Image.Image],
tokenizer: TokenizerLike,
image_processor: BaseImageProcessorFast,
*,
min_dynamic_patch: int | None = None, min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None, max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None, dynamic_image_size: bool | None = None,
) -> None: ) -> list[torch.Tensor]:
ABC.__init__(self) min_num, max_num = self.resolve_min_max_num(
self.config = config min_dynamic_patch=min_dynamic_patch,
self.tokenizer = tokenizer max_dynamic_patch=max_dynamic_patch,
self.image_processor = image_processor dynamic_image_size=dynamic_image_size,
image_size: int = config.force_image_size use_thumbnail=False, # Applied in image_to_pixel_values
patch_size: int = config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = 1
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = self.image_processor.max_num_tiles
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = True
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
) )
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
if image_processor is not None: return [
self.use_thumbnail = image_processor.use_thumbnail image_to_pixel_values_nemotron_vl(
else: image,
self.use_thumbnail = getattr(config, "use_thumbnail", True) input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=build_transform(self.image_size),
)
for image in images
]
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
def _get_transform(self) -> T.Compose: class LlamaNemotronNanoVLProcessor(InternVLProcessor):
return build_transform(input_size=self.image_size) """
This model doesn't define its own HF processor,
so we implement our own one here.
The image processor is given by:
https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/image_processing.py
"""
def __init__(
    self,
    image_processor: LlamaNemotronNanoVLImageProcessor,
    tokenizer: HfTokenizer,
    *,
    image_seq_length: int,
    start_image_token: str = "<img>",
    end_image_token: str = "</img>",
    ctx_image_token: str = "<image>",
) -> None:
    """Initialize the processor by forwarding all arguments to the parent.

    Nothing is stored or transformed here; the override only provides the
    default token strings (note the context token defaults to ``<image>``)
    and passes everything unchanged to ``super().__init__``.
    """
    super().__init__(
        image_processor=image_processor,
        tokenizer=tokenizer,
        image_seq_length=image_seq_length,
        start_image_token=start_image_token,
        end_image_token=end_image_token,
        ctx_image_token=ctx_image_token,
    )
def get_num_image_tokens( def get_num_image_tokens(
self, self,
...@@ -232,6 +230,7 @@ class NemotronVLProcessor(InternVLProcessor): ...@@ -232,6 +230,7 @@ class NemotronVLProcessor(InternVLProcessor):
image_width: int, image_width: int,
image_height: int, image_height: int,
) -> int: ) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios( target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets use_thumbnail=False, # Applied in calculate_targets
) )
...@@ -239,13 +238,33 @@ class NemotronVLProcessor(InternVLProcessor): ...@@ -239,13 +238,33 @@ class NemotronVLProcessor(InternVLProcessor):
num_patches, _, _ = calculate_nemotron_vl_targets( num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width, orig_width=image_width,
orig_height=image_height, orig_height=image_height,
image_size=self.image_size, image_size=image_processor.image_size,
target_ratios=target_ratios, target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail, use_thumbnail=image_processor.use_thumbnail,
)
return num_patches * self.image_seq_length
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Return the base transform followed by SigLIP normalization.

    The pipeline runs ``build_transform(input_size=input_size)`` first and
    then normalizes with ``SIGLIP_MEAN`` / ``SIGLIP_STD``.
    """
    base_transform = build_transform(input_size=input_size)
    siglip_normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([base_transform, siglip_normalize])
return num_patches * self.num_image_token
class LlamaNemotronVLEmbedImageProcessor(InternVLImageProcessor):
def _images_to_pixel_values_lst( def _images_to_pixel_values_lst(
self, self,
images: list[Image.Image], images: list[Image.Image],
...@@ -267,83 +286,13 @@ class NemotronVLProcessor(InternVLProcessor): ...@@ -267,83 +286,13 @@ class NemotronVLProcessor(InternVLProcessor):
min_num=min_num, min_num=min_num,
max_num=max_num, max_num=max_num,
use_thumbnail=self.use_thumbnail, use_thumbnail=self.use_thumbnail,
transform=self._get_transform(), transform=build_siglip_transform(self.image_size),
) )
for image in images for image in images
] ]
def _replace_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Replace <image> placeholders with image tokens."""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
def _preprocess_image(
    self,
    text: list[str],
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
    """Convert images to model inputs and expand their prompt placeholders.

    Returns the (possibly rewritten) prompts together with a dict holding
    ``pixel_values_flat`` (all per-image tensors concatenated) and
    ``image_num_patches`` (the length of each per-image tensor). With no
    images, the prompts are returned untouched alongside an empty dict.
    """
    # Nothing to do without images.
    if len(images) == 0:
        return text, {}

    pixel_values_lst = self._images_to_pixel_values_lst(
        images,
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
    )

    flat_pixel_values = torch.cat(pixel_values_lst)
    patches_per_image = torch.tensor([len(item) for item in pixel_values_lst])
    image_inputs = {
        "pixel_values_flat": flat_pixel_values,
        "image_num_patches": patches_per_image,
    }

    text = self._replace_image_tokens(text, pixel_values_lst)
    return text, image_inputs
def get_image_repl(
    self,
    feature_size: int,
    num_patches: int | None,
) -> PromptUpdateDetails[str]:
    """Build the replacement text for a single image placeholder.

    The text is ``self.IMG_CONTEXT`` repeated ``feature_size`` times,
    enclosed by ``self.IMG_START`` and ``self.IMG_END``. ``num_patches`` is
    accepted but not used here.
    """
    context_run = self.IMG_CONTEXT * feature_size
    wrapped = self.IMG_START + context_run + self.IMG_END
    return PromptUpdateDetails.select_text(wrapped, self.IMG_CONTEXT)
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Create the image transform used with the SigLIP vision encoder.

    Chains ``build_transform(input_size=input_size)`` with a normalization
    step parameterized by ``SIGLIP_MEAN`` and ``SIGLIP_STD``.
    """
    steps = [
        build_transform(input_size=input_size),
        T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
    ]
    return T.Compose(steps)
class LlamaNemotronVLEmbedProcessor(InternVLProcessor):
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
""" """
Processor for LlamaNemotronVL embedding model. Processor for LlamaNemotronVL embedding model.
...@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor): ...@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
- Uses different image context token (<IMG_CONTEXT> vs <image>) - Uses different image context token (<IMG_CONTEXT> vs <image>)
""" """
IMG_CONTEXT = "<IMG_CONTEXT>"
def __init__( def __init__(
self, self,
config: PretrainedConfig, image_processor: LlamaNemotronVLEmbedImageProcessor,
tokenizer: TokenizerLike, tokenizer: HfTokenizer,
processor_config: dict,
*, *,
min_dynamic_patch: int | None = None, image_seq_length: int,
max_dynamic_patch: int | None = None, start_image_token: str = "<img>",
dynamic_image_size: bool | None = None, end_image_token: str = "</img>",
ctx_image_token: str = "<IMG_CONTEXT>",
) -> None: ) -> None:
if min_dynamic_patch is None:
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
if max_dynamic_patch is None:
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
if dynamic_image_size is None:
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
super().__init__( super().__init__(
config=config, image_processor=image_processor,
tokenizer=tokenizer, tokenizer=tokenizer,
image_processor=None, image_seq_length=image_seq_length,
min_dynamic_patch=min_dynamic_patch, start_image_token=start_image_token,
max_dynamic_patch=max_dynamic_patch, end_image_token=end_image_token,
dynamic_image_size=dynamic_image_size, ctx_image_token=ctx_image_token,
) )
def _get_transform(self) -> T.Compose: self.image_processor: LlamaNemotronVLEmbedImageProcessor
"""Override to add SigLIP normalization."""
return build_siglip_transform(input_size=self.image_size)
def _replace_image_tokens( def get_num_image_tokens(
self, self,
text: list[str], *,
pixel_values_lst: list[torch.Tensor], image_width: int,
) -> list[str]: image_height: int,
"""Override with simpler token replacement for embedding model. ) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>, num_patches, _, _ = calculate_nemotron_vl_targets(
not <image>, so there's no collision risk. orig_width=image_width,
""" orig_height=image_height,
for pixel_values in pixel_values_lst: image_size=image_processor.image_size,
num_patches = pixel_values.shape[0] target_ratios=target_ratios,
feature_size = num_patches * self.num_image_token use_thumbnail=image_processor.use_thumbnail,
image_repl = self.get_image_repl(feature_size, num_patches) )
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text return num_patches * self.image_seq_length
...@@ -8,37 +8,54 @@ ...@@ -8,37 +8,54 @@
# Licensed under Apache 2.0 License [see LICENSE for details] # Licensed under Apache 2.0 License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
from vllm.multimodal.processing import PromptUpdateDetails from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers.hf import HfTokenizer
from .internvl import BaseInternVLProcessor from .internvl import InternVLImageProcessor, InternVLProcessor
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(InternVLProcessor):
class NVLMProcessor(BaseInternVLProcessor): def __init__(
@property self,
def image_token_id(self) -> int: image_processor: InternVLImageProcessor,
return self.tokenizer.get_vocab()[IMG_PAD] tokenizer: HfTokenizer,
*,
image_seq_length: int,
start_image_token: str = "<Image>",
end_image_token: str = "</Image>",
ctx_image_token: str = "<|vision_pad|>",
) -> None:
super().__init__(
image_processor=image_processor,
tokenizer=tokenizer,
image_seq_length=image_seq_length,
start_image_token=start_image_token,
end_image_token=end_image_token,
ctx_image_token=ctx_image_token,
)
def get_image_repl( def get_image_repl(
self, self,
feature_size: int,
num_patches: int | None, num_patches: int | None,
num_features: int | None = None,
) -> PromptUpdateDetails[str]: ) -> PromptUpdateDetails[str]:
if num_patches is None: if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported") raise NotImplementedError("Embedding inputs are not supported")
num_features = num_patches * self.image_seq_length
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)] tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
if self.use_thumbnail: if self.image_processor.use_thumbnail:
tile_pos_identifiers += ["<tile_global_thumbnail>"] tile_pos_identifiers += ["<tile_global_thumbnail>"]
context_size = feature_size // num_patches context_size = num_features // num_patches
features = "".join( features = "".join(
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers (identifier + self.ctx_image_token * context_size)
for identifier in tile_pos_identifiers
) )
# We include the start and end as well because "<Image><tile" is # We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error # tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile" # when trying to find "<tile" as a subsequence of "<Image><tile"
repl = "<Image>" + features + "</Image>" repl = self.start_image_token + features + self.end_image_token
return PromptUpdateDetails.select_text(repl, IMG_PAD) return PromptUpdateDetails.select_text(repl, self.ctx_image_token)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
# Marker strings used when expanding an "<image>" placeholder in the prompt:
# the replacement is IMG_START, then IMG_CONTEXT repeated once per image
# feature, then IMG_END (see SkyworkR1VProcessor.get_image_repl).
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
# Per-channel mean / std passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Build the preprocessing pipeline applied to every image tile.

    The returned ``T.Compose`` converts the image to RGB, resizes it to an
    ``input_size`` x ``input_size`` square using bicubic interpolation,
    converts it to a tensor and normalizes it with ``IMAGENET_MEAN`` and
    ``IMAGENET_STD``.
    """

    def to_rgb(img):
        return convert_image_mode(img, "RGB")

    square = (input_size, input_size)
    steps = [
        T.Lambda(to_rgb),
        T.Resize(square, interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tiling grid whose aspect ratio is closest to ``aspect_ratio``.

    Candidates are visited in the order given. A candidate with a strictly
    smaller distance always wins. A candidate that exactly ties the current
    best distance replaces the current choice only when the image area
    (``width * height``) exceeds half the pixel area of that candidate's grid
    (``image_size**2 * cols * rows``).

    Returns ``(1, 1)`` when ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue

        # Exact tie: prefer this grid only if the image is large enough
        # to fill at least half of it.
        if (
            gap == smallest_gap
            and image_area > 0.5 * image_size * image_size * cols * rows
        ):
            chosen = candidate

    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective ``(min, max)`` patch-count bounds.

    Both bounds collapse to 1 when ``dynamic_image_size`` is false. When
    ``use_thumbnail`` is set and the upper bound is not 1, the upper bound is
    increased by one.
    """
    if dynamic_image_size:
        lower, upper = min_dynamic_patch, max_dynamic_patch
    else:
        lower = upper = 1

    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate the ``(cols, rows)`` grids whose tile count is in range.

    A grid is kept when ``min_num <= cols * rows <= max_num``. The unique
    grids are returned sorted by tile count (``cols * rows``), ascending.
    """
    grids: set[tuple[int, int]] = set()

    for side_limit in range(min_num, max_num + 1):
        sides = range(1, side_limit + 1)
        for cols in sides:
            for rows in sides:
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))

    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the tile count and resize target for an image.

    The grid is chosen by ``find_closest_aspect_ratio`` from
    ``target_ratios``.

    Returns:
        ``(blocks, target_width, target_height)``, where the target size is
        ``image_size`` times the chosen grid's columns / rows, and ``blocks``
        is ``cols * rows`` plus one extra for the thumbnail when
        ``use_thumbnail`` is set and the grid has more than one tile.
    """
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = grid[0], grid[1]

    num_blocks = cols * rows
    # A single-tile image never gets an additional thumbnail tile.
    if use_thumbnail and num_blocks != 1:
        num_blocks += 1

    return num_blocks, image_size * cols, image_size * rows
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split ``image`` into ``image_size`` x ``image_size`` tiles.

    The image is resized to the target size computed by
    ``calculate_skyworkr1v_targets`` and cropped row by row, left to right.
    When ``use_thumbnail`` is set and more than one tile was produced, a copy
    of the original image resized to a single tile is appended at the end.
    """
    orig_width, orig_height = image.size

    # The thumbnail is appended below, so it is excluded from the tile count.
    num_tiles, target_width, target_height = calculate_skyworkr1v_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    tiles = []
    for index in range(num_tiles):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert one image into a stacked tensor of preprocessed tiles.

    The image is tiled with ``dynamic_preprocess_skyworkr1v`` using the grids
    from ``get_skyworkr1v_target_ratios(min_num, max_num)``. Each tile is
    passed through ``build_transform(input_size)`` and the results are
    stacked along a new leading dimension.
    """
    target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
    transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=target_ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([transform(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    Hand-written processor for SkyworkR1V, implemented here because the
    model does not define an HF processor of its own.

    The image-token insertion logic is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store the tokenizer and derive tiling settings from ``config``.

        Any of ``min_dynamic_patch``, ``max_dynamic_patch`` and
        ``dynamic_image_size`` left as ``None`` is read from ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config
        image_size: int = vision_config.image_size
        patch_size: int = vision_config.patch_size

        min_dynamic_patch = (
            config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        assert isinstance(min_dynamic_patch, int)

        max_dynamic_patch = (
            config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(dynamic_image_size, bool)

        # Number of IMG_CONTEXT tokens contributed by a single tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_CONTEXT`` in the tokenizer."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for a single image.

        The full text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
        ``feature_size`` times, then ``IMG_END``. ``num_patches`` is not
        used by this implementation.
        """
        context_run = IMG_CONTEXT * feature_size
        full_text = f"{IMG_START}{context_run}{IMG_END}"
        return PromptUpdateDetails.select_text(full_text, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve the ``(min, max)`` patch bounds.

        Arguments left as ``None`` fall back to the values stored on this
        processor.
        """
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tiling grids for the resolved patch bounds."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower, upper = bounds
        return get_skyworkr1v_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of image tokens for an image of the given size."""
        # The thumbnail is accounted for in calculate_skyworkr1v_targets,
        # so it must not also widen the candidate grids here.
        target_ratios = self.resolve_target_ratios(use_thumbnail=False)

        targets = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )
        num_patches = targets[0]

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its own tensor of preprocessed tiles."""
        # The thumbnail is added inside image_to_pixel_values_skyworkr1v,
        # so it is excluded when resolving the bounds.
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        pixel_values_lst = []
        for image in images:
            pixel_values = image_to_pixel_values_skyworkr1v(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            pixel_values_lst.append(pixel_values)
        return pixel_values_lst

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one ``BatchFeature``.

        For each image, in order, the first remaining ``"<image>"`` in every
        text entry is replaced by that image's expanded replacement text
        before tokenization. When images are given, the output also holds
        ``pixel_values_flat`` (all tiles concatenated) and
        ``image_num_patches`` (tile count per image).
        """
        # Normalize both inputs to lists.
        if text is None:
            text = []
        elif not isinstance(text, list):
            text = [text]

        if images is None:
            images = []
        elif not isinstance(images, list):
            images = [images]

        image_inputs = {}
        if images:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            patch_counts = [len(item) for item in pixel_values_lst]
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(patch_counts),
            }

            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token
                replacement = self.get_image_repl(feature_size, num_patches).full
                text = [t.replace("<image>", replacement, 1) for t in text]

        text_inputs = self.tokenizer(text)

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment