Unverified commit 99267c23, authored by Cyrus Leung, committed by GitHub
Browse files

[2/3] Refactor InternVL-based processors (#37324)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 525f2eeb
......@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
from vllm.transformers_utils.processors.h2ovl import (
image_to_pixel_values_h2ovl,
)
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_h2ovl(
......@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
)
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=self.min_num,
......@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos: npt.NDArray | list[npt.NDArray] = None,
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
......
......@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"rednote-hilab/dots.ocr", trust_remote_code=True
),
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
"nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
"nvidia/Eagle2.5-8B",
trust_remote_code=True,
),
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
......
......@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
......@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = (
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model."""
def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor:
return self.ctx.init_processor(
Eagle2_5_VLProcessor,
config=self.ctx.get_hf_config(),
def get_image_processor(self, **kwargs):
    """Build the InternVL image processor used for Eagle2.5-VL.

    Options already present in the merged multimodal kwargs are kept;
    anything missing is filled in from the HF config.
    """
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)
    config_defaults = {
        # A truthy ``force_image_size`` takes priority over the vision
        # config's own image size.
        "image_size": hf_config.force_image_size or vision_cfg.image_size,
        "min_dynamic_patch": hf_config.min_dynamic_patch,
        "max_dynamic_patch": hf_config.max_dynamic_patch,
        "dynamic_image_size": hf_config.dynamic_image_size,
        "use_thumbnail": hf_config.use_thumbnail,
    }
    for option, fallback in config_defaults.items():
        merged.setdefault(option, fallback)

    return InternVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return InternVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
......
......@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
vision_config = config.vision_config
image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size})
return GLM4VImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
return self.ctx.init_processor(
GLM4VProcessor,
return GLM4VProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs),
)
......
......@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import (
PromptUpdate,
TimingContext,
)
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor
from .intern_vit import InternVisionModel
from .internvl import (
......@@ -40,12 +40,34 @@ from .internvl import (
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
    """Build the H2OVL image processor.

    Options already present in the merged multimodal kwargs are kept;
    anything missing (including ``use_msac``) is filled in from the HF
    config.
    """
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)
    config_defaults = {
        "image_size": vision_cfg.image_size,
        "min_dynamic_patch": hf_config.min_dynamic_patch,
        "max_dynamic_patch": hf_config.max_dynamic_patch,
        "dynamic_image_size": hf_config.dynamic_image_size,
        "use_thumbnail": hf_config.use_thumbnail,
        "use_msac": hf_config.use_msac,
    }
    for option, fallback in config_defaults.items():
        merged.setdefault(option, fallback)

    return H2OVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
return self.ctx.init_processor(
H2OVLProcessor,
config=self.get_hf_config(),
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return H2OVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
def get_num_image_tokens(
......@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(
......
......@@ -9,6 +9,7 @@
# --------------------------------------------------------
from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from typing import Annotated, Literal, TypeAlias, TypeVar
import torch
......@@ -45,8 +46,9 @@ from vllm.multimodal.processing import (
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.internvl import (
BaseInternVLProcessor,
InternVLImageProcessor,
InternVLProcessor,
InternVLVideoProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
......@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""
@abstractmethod
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
......@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
*,
image_width: int,
image_height: int,
processor: BaseInternVLProcessor,
processor: InternVLProcessor,
) -> int:
return processor.get_num_image_tokens(
image_width=image_width,
......@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor()
image_processor = processor.image_processor
base_size = processor.image_size
base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None
......@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id
image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
......@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(
......@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
class InternVLProcessingInfo(BaseInternVLProcessingInfo):
"""InternVL ProcessingInfo extended for video processing"""
@property
def supports_video(self):
return self.get_hf_processor().supports_video
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
def get_supported_mm_limits(self):
video_limit = {"video": None} if self.supports_video else {}
return {**super().get_supported_mm_limits(), **video_limit}
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
return InternVLImageProcessor(**kwargs)
def get_video_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
def get_video_token(self) -> str | None:
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
return InternVLVideoProcessor(**kwargs)
@cached_property
def ctx_video_token(self):
text_model_type = self.get_hf_config().get_text_config().model_type
video_token_map = {
ctx_video_token_map = {
"qwen2": "<|video_pad|>",
"qwen3": "<|video_pad|>",
"qwen3_moe": "<|video_pad|>",
"gpt_oss": "<|reserved_200000|>",
}
return video_token_map.get(text_model_type)
if text_model_type not in ctx_video_token_map:
return None
ctx_video_token = ctx_video_token_map[text_model_type]
if ctx_video_token not in self.get_tokenizer().get_vocab():
return None
return ctx_video_token
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
    """Assemble the InternVL processor from its sub-processors.

    The image processor is always created. The video processor is only
    created when ``ctx_video_token`` is truthy; otherwise ``None`` is
    passed through.
    """
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    img_processor = self.get_image_processor(**kwargs)

    # Sequence length per image tile: squared patches-per-side, scaled by
    # the squared downsample ratio and truncated to an int.
    patches_per_side = img_processor.image_size // vision_cfg.patch_size
    ratio = hf_config.downsample_ratio
    seq_length = int(patches_per_side**2 * (ratio**2))

    video_token = self.ctx_video_token
    if video_token:
        vid_processor = self.get_video_processor(**kwargs)
    else:
        vid_processor = None

    return InternVLProcessor(
        tokenizer=self.get_tokenizer(),
        image_processor=img_processor,
        video_processor=vid_processor,
        image_seq_length=seq_length,
        ctx_video_token=video_token,
    )
def get_supported_mm_limits(self):
    """Return the parent's modality limits, plus ``"video"`` when enabled.

    The ``"video"`` entry (with limit ``None``) is added only when
    ``ctx_video_token`` is truthy.
    """
    video_enabled = self.ctx_video_token
    limits = dict(super().get_supported_mm_limits())
    if video_enabled:
        limits["video"] = None
    return limits
def get_num_frames_with_most_features(
self,
......@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
max_videos = mm_counts.get("video", 0)
processor = self.get_hf_processor()
num_image_token = processor.image_seq_length
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
max_total_frames = (seq_len - max_image_tokens) // num_image_token
max_frames_per_video = max_total_frames // max(max_videos, 1)
return max(max_frames_per_video, 1)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
video_token=self.get_video_token(),
**kwargs,
)
class InternVLDummyInputsBuilder(
BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
......@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder(
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
if self.info.supports_video:
if self.info.ctx_video_token:
config = self.info.get_hf_config()
image_size: int = config.vision_config.image_size
target_num_frames = self.info.get_num_frames_with_most_features(
......@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor(
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
if (
self.info.supports_video
and (video_token_id := hf_processor.video_token_id) is not None
):
if (video_token_id := hf_processor.ctx_video_token_id) is not None:
processed_outputs["video_token_id"] = torch.tensor(video_token_id)
return processed_outputs
def _get_mm_fields_config(
......@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
if self.info.supports_video:
if self.info.ctx_video_token:
video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
num_videos = len(video_num_patches)
video_fields = dict(
......@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
out_mm_kwargs=out_mm_kwargs,
)
if self.info.ctx_video_token is None:
return prompt_repl
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
......@@ -456,26 +501,20 @@ class InternVLMultiModalProcessor(
video_num_patches = []
def get_video_replacement_internvl(item_idx: int):
feature_size = hf_processor.num_image_token
num_patches = video_num_patches[item_idx]
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_video_repl(
feature_size, num_patches, video_context_token=hf_processor.video_token
)
if self.info.supports_video:
prompt_repl = [
*prompt_repl,
PromptReplacement(
modality="video",
target="<video>",
replacement=get_video_replacement_internvl,
),
]
return hf_processor.get_video_repl(num_patches)
return prompt_repl
return [
*prompt_repl,
PromptReplacement(
modality="video",
target="<video>",
replacement=get_video_replacement_internvl,
),
]
@MULTIMODAL_REGISTRY.register_processor(
......
......@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processor import cached_image_processor_from_config
from vllm.transformers_utils.processors.nemotron_vl import (
LlamaNemotronNanoVLImageProcessor,
LlamaNemotronNanoVLProcessor,
LlamaNemotronVLEmbedImageProcessor,
LlamaNemotronVLEmbedProcessor,
NemotronVLProcessor,
)
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
......@@ -50,19 +52,34 @@ from .utils import (
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Nemotron VL models."""
def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor:
return self.ctx.init_processor(
NemotronVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(),
**kwargs,
def get_image_processor(self, **kwargs: object):
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
orig_processor = cached_image_processor_from_config(
self.ctx.model_config, **kwargs
)
def get_image_processor(self, **kwargs: object):
return cached_image_processor_from_config(
self.ctx.model_config,
**kwargs,
return LlamaNemotronNanoVLImageProcessor(
image_size=orig_processor.image_size,
min_dynamic_patch=1,
max_dynamic_patch=orig_processor.max_num_tiles,
dynamic_image_size=True,
use_thumbnail=orig_processor.use_thumbnail,
)
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronNanoVLProcessor:
    """Assemble the Nemotron VL processor from tokenizer and image processor."""
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    img_processor = self.get_image_processor(**kwargs)

    # Sequence length per image tile: squared patches-per-side, scaled by
    # the squared downsample ratio and truncated to an int.
    patches_per_side = img_processor.image_size // vision_cfg.patch_size
    ratio = hf_config.downsample_ratio
    seq_length = int(patches_per_side**2 * (ratio**2))

    return LlamaNemotronNanoVLProcessor(
        tokenizer=self.get_tokenizer(),
        image_processor=img_processor,
        image_seq_length=seq_length,
    )
......@@ -386,29 +403,58 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# --------------------------------------------------------
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
class LlamaNemotronVLEmbedProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for LlamaNemotronVL embedding model."""
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
"""Override to create embedding-specific processor without image_processor."""
def get_image_processor(self, **kwargs):
model_config = self.ctx.model_config
processor_config = {}
if model_config.model is not None:
processor_config = (
get_hf_file_to_dict(
"processor_config.json",
model_config.model,
model_config.revision,
)
or {}
config = self.get_hf_config()
processor_config = (
get_hf_file_to_dict(
"processor_config.json",
model_config.model,
model_config.revision,
)
or {}
)
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", config.force_image_size)
kwargs.setdefault("min_dynamic_patch", min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", dynamic_image_size)
kwargs.setdefault("use_thumbnail", True)
return LlamaNemotronVLEmbedImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return self.ctx.init_processor(
LlamaNemotronVLEmbedProcessor,
config=self.get_hf_config(),
return LlamaNemotronVLEmbedProcessor(
tokenizer=self.get_tokenizer(),
processor_config=processor_config,
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
......
......@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor
from .intern_vit import InternVisionModel
from .internvl import (
......@@ -39,12 +40,33 @@ from .internvl import (
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
    """Build the InternVL image processor used for NVLM.

    Options already present in the merged multimodal kwargs are kept;
    anything missing is filled in from the HF config.
    """
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)
    config_defaults = {
        "image_size": vision_cfg.image_size,
        "min_dynamic_patch": hf_config.min_dynamic_patch,
        "max_dynamic_patch": hf_config.max_dynamic_patch,
        "dynamic_image_size": hf_config.dynamic_image_size,
        "use_thumbnail": hf_config.use_thumbnail,
    }
    for option, fallback in config_defaults.items():
        merged.setdefault(option, fallback)

    return InternVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
return self.ctx.init_processor(
NVLMProcessor,
config=self.get_hf_config(),
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return NVLMProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
......@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
if num_patches is not None:
assert isinstance(num_patches, int)
repl = hf_processor.get_image_repl(feature_size, num_patches)
repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)
return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)
return PromptUpdateDetails.select_text(
repl.full + "\n", hf_processor.ctx_image_token
)
# See note in dummy data regarding why we have the extra newline
return [
......
......@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
vision_config = config.visual
image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size})
return QwenVLImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor(
QwenVLProcessor,
return QwenVLProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs),
)
......
......@@ -43,7 +43,10 @@ from vllm.multimodal.processing import (
PromptUpdate,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
......@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = (
class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
return self.ctx.init_processor(
SkyworkR1VProcessor,
config=self.get_hf_config(),
def get_image_processor(self, **kwargs):
    """Build the InternVL image processor used for Skywork-R1V.

    Options already present in the merged multimodal kwargs are kept;
    anything missing is filled in from the HF config.
    """
    hf_config = self.get_hf_config()
    vision_cfg = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)
    config_defaults = {
        "image_size": vision_cfg.image_size,
        "min_dynamic_patch": hf_config.min_dynamic_patch,
        "max_dynamic_patch": hf_config.max_dynamic_patch,
        "dynamic_image_size": hf_config.dynamic_image_size,
        "use_thumbnail": hf_config.use_thumbnail,
    }
    for option, fallback in config_defaults.items():
        merged.setdefault(option, fallback)

    return InternVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return InternVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
......@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
*,
image_width: int,
image_height: int,
processor: SkyworkR1VProcessor,
processor: InternVLProcessor,
) -> int:
return processor.get_num_image_tokens(
image_width=image_width,
......@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor()
image_processor = processor.image_processor
base_size = processor.image_size
base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None
......@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id
image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
......@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(
......
......@@ -14,7 +14,6 @@ __all__ = [
"BagelProcessor",
"CohereASRProcessor",
"DeepseekVLV2Processor",
"Eagle2_5_VLProcessor",
"FireRedASR2Processor",
"FunASRProcessor",
"GLM4VProcessor",
......@@ -34,14 +33,12 @@ __all__ = [
"Ovis2_5Processor",
"QwenVLProcessor",
"Qwen3ASRProcessor",
"SkyworkR1VProcessor",
]
_CLASS_TO_MODULE: dict[str, str] = {
"BagelProcessor": "vllm.transformers_utils.processors.bagel",
"CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
"Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
"FunASRProcessor": "vllm.transformers_utils.processors.funasr",
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
......@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
"SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
}
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Subclasses BaseInternVLProcessor but sets up its own state and supplies
    Eagle-specific image token lookup and placeholder expansion.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # NOTE: super().__init__() is deliberately not called, to avoid the
        # config manipulation it performs; every attribute is set here.
        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config

        # A truthy ``force_image_size`` on the config replaces the vision
        # tower's own image size.
        image_size: int = vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Number of context tokens a single image tile expands to.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # An explicit argument wins; otherwise use the config value, and
        # failing that a hard-coded default.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Return the image token ID, preferring the config over the tokenizer.

        Raises:
            ValueError: If the config has no ``image_token_index`` and the
                tokenizer vocabulary does not contain ``IMG_CONTEXT``.
        """
        try:
            return self.config.image_token_index
        except AttributeError:
            pass

        # Otherwise look the IMG_CONTEXT token up in the tokenizer vocabulary.
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the prompt replacement text for one image.

        The text is ``feature_size`` copies of ``IMG_CONTEXT`` wrapped in
        ``IMG_START`` / ``IMG_END``. ``num_patches`` is accepted but not used
        here.
        """
        placeholder = f"{IMG_START}{IMG_CONTEXT * feature_size}{IMG_END}"
        return PromptUpdateDetails.select_text(placeholder, IMG_CONTEXT)
......@@ -10,16 +10,12 @@
# --------------------------------------------------------
import torch
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import HfTokenizer
from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLProcessor,
InternVLImageProcessor,
InternVLProcessor,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
......@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl(
return pixel_values
class H2OVLProcessor(BaseInternVLProcessor):
class H2OVLImageProcessor(InternVLImageProcessor):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_msac: bool | None = None,
image_size: int,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
use_msac: bool,
) -> None:
super().__init__(
config,
tokenizer,
image_size=image_size,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
if use_msac is None:
use_msac = config.use_msac
assert isinstance(use_msac, bool)
self.use_msac = use_msac
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num(
self,
*,
......@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
if min_dynamic_patch is None:
min_dynamic_patch = self.min_dynamic_patch
if max_dynamic_patch is None:
max_dynamic_patch = self.max_dynamic_patch
if dynamic_image_size is None:
dynamic_image_size = self.dynamic_image_size
if use_thumbnail is None:
use_thumbnail = self.use_thumbnail
return resolve_h2ovl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
......@@ -284,6 +257,57 @@ class H2OVLProcessor(BaseInternVLProcessor):
use_thumbnail=use_thumbnail,
)
def _images_to_pixel_values_lst(
    self,
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
    """Convert every image with ``image_to_pixel_values_h2ovl``.

    The optional arguments are forwarded to ``resolve_min_max_num``; the
    resulting pair is used as ``min_num`` / ``max_num`` for each image.
    Results are returned in the same order as ``images``.
    """
    # ``self.use_msac`` is only consulted when exactly one image is given;
    # otherwise MSAC is switched off.
    msac_enabled = len(images) == 1 and self.use_msac

    lower, upper = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=False,  # Applied in image_to_pixel_values
    )

    converted: list[torch.Tensor] = []
    for img in images:
        converted.append(
            image_to_pixel_values_h2ovl(
                img,
                input_size=self.image_size,
                min_num=lower,
                max_num=upper,
                use_thumbnail=self.use_thumbnail,
                use_msac=msac_enabled,
            )
        )
    return converted
class H2OVLProcessor(InternVLProcessor):
def __init__(
    self,
    image_processor: H2OVLImageProcessor,
    tokenizer: HfTokenizer,
    *,
    image_seq_length: int,
    start_image_token: str = "<img>",
    end_image_token: str = "</img>",
    ctx_image_token: str = "<IMG_CONTEXT>",
) -> None:
    """Forward all arguments unchanged to the parent initializer.

    Args:
        image_processor: Passed through as ``image_processor``.
        tokenizer: Passed through as ``tokenizer``.
        image_seq_length: Passed through as ``image_seq_length``.
        start_image_token: Passed through; defaults to ``"<img>"``.
        end_image_token: Passed through; defaults to ``"</img>"``.
        ctx_image_token: Passed through; defaults to ``"<IMG_CONTEXT>"``.
    """
    marker_tokens = {
        "start_image_token": start_image_token,
        "end_image_token": end_image_token,
        "ctx_image_token": ctx_image_token,
    }
    super().__init__(
        image_processor=image_processor,
        tokenizer=tokenizer,
        image_seq_length=image_seq_length,
        **marker_tokens,
    )

    # Annotation only: no value is assigned here, the line just declares
    # the attribute's type for static checkers.
    self.image_processor: H2OVLImageProcessor
def resolve_target_ratios(
self,
*,
......@@ -294,7 +318,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
prior_aspect_ratio: tuple[int, int] | None = None,
override_min_num: int | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_num, max_num = self.image_processor.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
......@@ -316,9 +340,10 @@ class H2OVLProcessor(BaseInternVLProcessor):
image_height: int,
use_msac: bool | None = None,
) -> int:
use_msac = self.use_msac if use_msac is None else use_msac
image_processor = self.image_processor
use_msac = image_processor.use_msac if use_msac is None else use_msac
use_thumbnail = self.use_thumbnail
use_thumbnail = image_processor.use_thumbnail
if use_msac:
target_ratios_1 = self.resolve_target_ratios(
......@@ -328,7 +353,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
image_size=image_processor.image_size,
target_ratios=target_ratios_1,
use_thumbnail=True,
)
......@@ -341,7 +366,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches_2, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
image_size=image_processor.image_size,
target_ratios=target_ratios_2,
use_thumbnail=True,
)
......@@ -354,37 +379,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
image_size=image_processor.image_size,
target_ratios=target_ratios,
use_thumbnail=use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
    self,
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
    """Convert every image with ``image_to_pixel_values_h2ovl``.

    The optional arguments are forwarded to ``resolve_min_max_num``; the
    resulting pair is used as ``min_num`` / ``max_num`` for each image.
    Results are returned in the same order as ``images``.
    """
    # ``self.use_msac`` is only consulted when exactly one image is given;
    # otherwise MSAC is switched off.
    msac_enabled = len(images) == 1 and self.use_msac

    lower, upper = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=False,  # Applied in image_to_pixel_values
    )

    converted: list[torch.Tensor] = []
    for img in images:
        converted.append(
            image_to_pixel_values_h2ovl(
                img,
                input_size=self.image_size,
                min_num=lower,
                max_num=upper,
                use_thumbnail=self.use_thumbnail,
                use_msac=msac_enabled,
            )
        )
    return converted
return num_patches * self.image_seq_length
......@@ -7,24 +7,17 @@
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, TypeVar
import numpy.typing as npt
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import BatchFeature, TensorType
from transformers.processing_utils import ProcessorMixin
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
_T = TypeVar("_T")
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
from vllm.tokenizers.hf import HfTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
......@@ -33,7 +26,7 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
return T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
......@@ -43,7 +36,6 @@ def build_transform(input_size: int):
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
......@@ -223,65 +215,20 @@ def video_to_pixel_values_internvl(
return pixel_values
class BaseInternVLProcessor(ABC):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
class InternVLImageProcessor:
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
image_size: int,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@property
@abstractmethod
def image_token_id(self) -> int:
raise NotImplementedError
@abstractmethod
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
raise NotImplementedError
self.use_thumbnail = use_thumbnail
def resolve_min_max_num(
self,
......@@ -291,18 +238,14 @@ class BaseInternVLProcessor(ABC):
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
if min_dynamic_patch is None:
min_dynamic_patch = self.min_dynamic_patch
if max_dynamic_patch is None:
max_dynamic_patch = self.max_dynamic_patch
if dynamic_image_size is None:
dynamic_image_size = self.dynamic_image_size
if use_thumbnail is None:
use_thumbnail = self.use_thumbnail
return resolve_internvl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
......@@ -311,43 +254,6 @@ class BaseInternVLProcessor(ABC):
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
    """Return the target ratios for the resolved min/max tile counts.

    All keyword arguments are forwarded as-is to ``resolve_min_max_num``;
    its two results are then given to ``get_internvl_target_ratios``.
    """
    overrides = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
        "use_thumbnail": use_thumbnail,
    }
    lower, upper = self.resolve_min_max_num(**overrides)
    return get_internvl_target_ratios(lower, upper)
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """Return the number of image tokens for an image of the given size.

    The patch count comes from ``calculate_internvl_targets`` and is
    multiplied by ``self.num_image_token``.
    """
    # The thumbnail is disabled while resolving ratios because it is
    # applied in calculate_targets instead.
    ratios = self.resolve_target_ratios(
        use_thumbnail=False,  # Applied in calculate_targets
    )

    patch_count, _, _ = calculate_internvl_targets(
        orig_width=image_width,
        orig_height=image_height,
        image_size=self.image_size,
        target_ratios=ratios,
        use_thumbnail=self.use_thumbnail,
    )
    return patch_count * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
......@@ -355,7 +261,14 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
if min_dynamic_patch is None:
min_dynamic_patch = self.min_dynamic_patch
if max_dynamic_patch is None:
max_dynamic_patch = self.max_dynamic_patch
if dynamic_image_size is None:
dynamic_image_size = self.dynamic_image_size
min_num, max_num = resolve_internvl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
......@@ -373,49 +286,9 @@ class BaseInternVLProcessor(ABC):
for image in images
]
def _preprocess_image(
    self,
    text: list[str],
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
    """Convert images to tensors and expand ``<image>`` tags in ``text``.

    Returns:
        A pair ``(text, image_inputs)``. With no images, ``text`` is
        returned untouched together with an empty dict. Otherwise the dict
        holds ``pixel_values_flat`` and ``image_num_patches``.
    """
    if len(images) == 0:
        return text, {}

    per_image = self._images_to_pixel_values_lst(
        images,
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
    )
    flat_pixels = torch.cat(per_image)
    patch_counts = torch.tensor([len(item) for item in per_image])
    image_inputs = {
        "pixel_values_flat": flat_pixels,
        "image_num_patches": patch_counts,
    }

    # One pass per image: each pass replaces the first remaining "<image>"
    # in every prompt string.
    for tensor in per_image:
        patches = tensor.shape[0]
        replacement = self.get_image_repl(patches * self.num_image_token, patches)
        text = [prompt.replace("<image>", replacement.full, 1) for prompt in text]

    return text, image_inputs
def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
    """Normalize ``input_item`` to a list.

    ``None`` becomes a new empty list, a list is returned as-is, and any
    other value is wrapped in a one-element list.
    """
    if input_item is None:
        return []
    if isinstance(input_item, list):
        return input_item
    return [input_item]
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
images: Image.Image | list[Image.Image],
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
......@@ -423,120 +296,173 @@ class BaseInternVLProcessor(ABC):
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
images_lst = [images] if not isinstance(images, list) else images
text, image_inputs = self._preprocess_image(
text=text,
images=images,
pixel_values_lst = self._images_to_pixel_values_lst(
images_lst,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
text_inputs = self.tokenizer(text)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
}
return BatchFeature(image_inputs, tensor_type=return_tensors)
combined_outputs = {**text_inputs, **image_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
class InternVLVideoProcessor:
    """Convert raw video arrays into a ``BatchFeature`` of pixel values.

    Every video is passed to ``video_to_pixel_values_internvl`` with
    ``min_num=1``, ``max_num=1`` and ``use_thumbnail=False``.
    """

    def __init__(
        self,
        image_size: int,
    ) -> None:
        # Forwarded as ``input_size`` to the per-video conversion helper.
        self.image_size = image_size

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
    ) -> list[torch.Tensor]:
        """Return one converted result per entry of ``videos``, in order."""
        converted: list[torch.Tensor] = []
        for clip in videos:
            converted.append(
                video_to_pixel_values_internvl(
                    clip,
                    input_size=self.image_size,
                    min_num=1,
                    max_num=1,
                    use_thumbnail=False,
                )
            )
        return converted

    def __call__(
        self,
        videos: npt.NDArray | list[npt.NDArray],
        *,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process one video or a list of videos.

        Args:
            videos: A single array or a list of arrays; a non-list value is
                wrapped in a one-element list.
            return_tensors: Passed to ``BatchFeature`` as ``tensor_type``.
            **kwargs: Accepted but not used by this method.

        Returns:
            A ``BatchFeature`` with ``pixel_values_flat_video`` (the
            concatenation of all per-video results) and
            ``video_num_patches`` (the ``len`` of each per-video result).
        """
        batch = videos if isinstance(videos, list) else [videos]
        per_video = self._videos_to_pixel_values_lst(batch)

        flat_pixels = torch.cat(per_video)
        patch_counts = torch.tensor([len(item) for item in per_video])
        features = {
            "pixel_values_flat_video": flat_pixels,
            "video_num_patches": patch_counts,
        }
        return BatchFeature(features, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
class InternVLProcessor(ProcessorMixin):
"""
HF Processor for InternVLChatModel with extended video processing logic.
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
attributes = ["image_processor", "tokenizer", "video_processor"]
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
image_processor: InternVLImageProcessor,
tokenizer: HfTokenizer,
video_processor: InternVLVideoProcessor | None = None,
*,
image_seq_length: int,
start_image_token: str = "<img>",
end_image_token: str = "</img>",
ctx_image_token: str = "<IMG_CONTEXT>",
ctx_video_token: str | None = None,
) -> None:
self.image_processor = image_processor
self.tokenizer = tokenizer
self.video_processor = video_processor
self.image_seq_length = image_seq_length
self.start_image_token = start_image_token
self.end_image_token = end_image_token
self.ctx_image_token = ctx_image_token
self.ctx_video_token = ctx_video_token
self.start_image_token_id = tokenizer.convert_tokens_to_ids(start_image_token)
self.end_image_token_id = tokenizer.convert_tokens_to_ids(end_image_token)
self.ctx_image_token_id = tokenizer.convert_tokens_to_ids(ctx_image_token)
self.ctx_video_token_id = (
None
if ctx_video_token is None
else tokenizer.convert_tokens_to_ids(ctx_video_token)
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
video_token: str | None = None,
) -> None:
super().__init__(
config=config,
tokenizer=tokenizer,
use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.image_processor.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
# add extra video token for video processing
self.video_token = video_token
@property
def image_token_id(self) -> int:
    """Return the tokenizer vocabulary entry for ``IMG_CONTEXT``."""
    vocab = self.tokenizer.get_vocab()
    return vocab[IMG_CONTEXT]
@property
def video_token_id(self) -> int | None:
if self.video_token is None:
return None
return self.tokenizer.get_vocab().get(self.video_token, None)
@property
def supports_video(self) -> bool:
    """Report whether ``video_token_id`` resolves to a value."""
    token_id = self.video_token_id
    return token_id is not None
return get_internvl_target_ratios(min_num, max_num)
def _videos_to_pixel_values_lst(
def get_num_image_tokens(
self,
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=1,
max_dynamic_patch=1,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
return [
video_to_pixel_values_internvl(
video,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=False,
)
for video in videos
]
num_patches, _, _ = calculate_internvl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=image_processor.image_size,
target_ratios=target_ratios,
use_thumbnail=image_processor.use_thumbnail,
)
def _preprocess_video(
return num_patches * self.image_seq_length
def get_image_repl(
self,
text: list[str],
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, Any]]:
if len(videos) == 0 or not self.supports_video:
return text, {}
num_patches: int | None,
num_features: int | None = None,
) -> PromptUpdateDetails[str]:
if num_patches is None:
assert num_features is not None
else:
num_features = num_patches * self.image_seq_length
video_token = self.video_token
assert video_token is not None
repl_features = self.ctx_image_token * num_features
repl_full = self.start_image_token + repl_features + self.end_image_token
pixel_values_lst_video = self._videos_to_pixel_values_lst(
videos,
dynamic_image_size=dynamic_image_size,
)
video_inputs = {
"pixel_values_flat_video": torch.cat(pixel_values_lst_video),
"video_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst_video]
),
}
return PromptUpdateDetails.select_text(repl_full, self.ctx_image_token)
for pixel_values in pixel_values_lst_video:
num_patches = pixel_values.shape[0]
def get_video_repl(self, num_patches: int) -> PromptUpdateDetails[str]:
assert self.ctx_video_token is not None
video_repl = self.get_video_repl(
self.num_image_token, num_patches, video_token
)
text = [t.replace("<video>", video_repl.full, 1) for t in text]
return text, video_inputs
repl_features = self.ctx_video_token * self.image_seq_length
repl_features_with_sep = (
self.start_image_token + repl_features + self.end_image_token
)
# num_patches is equal to num_frames
repl_full = "".join(
[f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
)
return PromptUpdateDetails.select_text(repl_full, self.ctx_video_token)
def __call__(
self,
......@@ -550,54 +476,88 @@ class InternVLProcessor(BaseInternVLProcessor):
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
videos = self._make_batch_input(videos)
if images is not None:
image_inputs = self.image_processor(
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
return_tensors=return_tensors,
)
image_num_patches = image_inputs["image_num_patches"]
else:
image_inputs = {}
image_num_patches = []
text, image_inputs = self._preprocess_image(
text=text,
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
if videos is not None:
if self.video_processor is None:
raise ValueError("This model does not support video inputs")
text, video_inputs = self._preprocess_video(
text=text,
videos=videos,
dynamic_image_size=dynamic_image_size,
)
video_inputs = self.video_processor(
videos=videos,
return_tensors=return_tensors,
)
video_num_patches = video_inputs["video_num_patches"]
else:
video_inputs = {}
video_num_patches = []
text_inputs = self.tokenizer(text)
if text is not None:
if not isinstance(text, list):
text = [text]
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
if image_inputs:
image_token = "<image>"
image_index = 0
processed_text = list[str]()
replace_strings = list[str]()
return BatchFeature(combined_outputs, tensor_type=return_tensors)
for prompt in text:
new_prompt = prompt
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
while image_token in new_prompt:
new_prompt = new_prompt.replace(image_token, "<placeholder>", 1)
image_repl = self.get_image_repl(image_num_patches[image_index])
replace_strings.append(image_repl.full)
image_index += 1
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
while "<placeholder>" in new_prompt:
replace_str = replace_strings.pop(0)
new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
def get_video_repl(
self,
feature_size: int,
num_patches: int | None,
video_context_token: str = IMG_CONTEXT,
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
processed_text.append(new_prompt)
repl_features = video_context_token * self.num_image_token
repl_features_with_sep = IMG_START + repl_features + IMG_END
# num_patches is equal to num_frames
repl_full = "".join(
[f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
)
text = processed_text
if video_inputs:
video_token = "<video>"
video_index = 0
processed_text = list[str]()
replace_strings = list[str]()
assert video_token is not None
for prompt in text:
new_prompt = prompt
while video_token in new_prompt:
new_prompt = new_prompt.replace(video_token, "<placeholder>", 1)
video_repl = self.get_video_repl(video_num_patches[video_index])
replace_strings.append(video_repl.full)
video_index += 1
while "<placeholder>" in new_prompt:
replace_str = replace_strings.pop(0)
new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
return PromptUpdateDetails.select_text(repl_full, video_context_token)
processed_text.append(new_prompt)
text = processed_text
text_inputs = self.tokenizer(text, return_tensors=return_tensors)
else:
text_inputs = {}
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
......@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor
from vllm.multimodal.evs import compute_retained_tokens_count
from vllm.multimodal.inputs import AudioItem
from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import HfTokenizer
from .internvl import calculate_internvl_targets, get_internvl_target_ratios
......@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
tokenizer: HfTokenizer,
*args,
max_model_len: int,
max_num_tiles: int | None = None,
......@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
tokenizer: HfTokenizer,
*,
max_model_len: int,
max_num_tiles: int | None = None,
......@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame: list[int],
frames_indices: list[int],
frame_duration_ms: int,
tokenizer: TokenizerLike,
tokenizer: HfTokenizer,
img_start_token_ids: list[int],
img_end_token_ids: list[int],
img_context_token_ids: list[int],
......@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds
tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
tokenizer (HfTokenizer): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import HfTokenizer
from .internvl import InternVLProcessor
from .internvl import InternVLImageProcessor, InternVLProcessor
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
......@@ -172,59 +168,61 @@ def image_to_pixel_values_nemotron_vl(
return pixel_values
class NemotronVLProcessor(InternVLProcessor):
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
def __init__(
class LlamaNemotronNanoVLImageProcessor(InternVLImageProcessor):
def _images_to_pixel_values_lst(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
image_processor: BaseImageProcessorFast,
*,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
ABC.__init__(self)
self.config = config
self.tokenizer = tokenizer
self.image_processor = image_processor
image_size: int = config.force_image_size
patch_size: int = config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = 1
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = self.image_processor.max_num_tiles
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = True
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
if image_processor is not None:
self.use_thumbnail = image_processor.use_thumbnail
else:
self.use_thumbnail = getattr(config, "use_thumbnail", True)
return [
image_to_pixel_values_nemotron_vl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=build_transform(self.image_size),
)
for image in images
]
class LlamaNemotronNanoVLProcessor(InternVLProcessor):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
The image processor is given by:
https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/image_processing.py
"""
def _get_transform(self) -> T.Compose:
return build_transform(input_size=self.image_size)
def __init__(
    self,
    image_processor: LlamaNemotronNanoVLImageProcessor,
    tokenizer: HfTokenizer,
    *,
    image_seq_length: int,
    start_image_token: str = "<img>",
    end_image_token: str = "</img>",
    ctx_image_token: str = "<image>",
) -> None:
    """Forward all arguments unchanged to the parent initializer.

    Args:
        image_processor: Passed through as ``image_processor``.
        tokenizer: Passed through as ``tokenizer``.
        image_seq_length: Passed through as ``image_seq_length``.
        start_image_token: Passed through; defaults to ``"<img>"``.
        end_image_token: Passed through; defaults to ``"</img>"``.
        ctx_image_token: Passed through; defaults to ``"<image>"``.
    """
    marker_tokens = {
        "start_image_token": start_image_token,
        "end_image_token": end_image_token,
        "ctx_image_token": ctx_image_token,
    }
    super().__init__(
        image_processor=image_processor,
        tokenizer=tokenizer,
        image_seq_length=image_seq_length,
        **marker_tokens,
    )
def get_num_image_tokens(
self,
......@@ -232,6 +230,7 @@ class NemotronVLProcessor(InternVLProcessor):
image_width: int,
image_height: int,
) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
......@@ -239,13 +238,33 @@ class NemotronVLProcessor(InternVLProcessor):
num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
image_size=image_processor.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
use_thumbnail=image_processor.use_thumbnail,
)
return num_patches * self.num_image_token
return num_patches * self.image_seq_length
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Build the transform pipeline with SigLIP normalization appended.

    The result is a ``T.Compose`` of ``build_transform(input_size)``
    followed by ``T.Normalize`` using ``SIGLIP_MEAN`` and ``SIGLIP_STD``.
    """
    base_transform = build_transform(input_size=input_size)
    siglip_normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([base_transform, siglip_normalize])
class LlamaNemotronVLEmbedImageProcessor(InternVLImageProcessor):
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
......@@ -267,83 +286,13 @@ class NemotronVLProcessor(InternVLProcessor):
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=self._get_transform(),
transform=build_siglip_transform(self.image_size),
)
for image in images
]
def _replace_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Replace <image> placeholders with image tokens."""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
def _preprocess_image(
    self,
    text: list[str],
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
    """Convert images to tensors and expand image placeholders in ``text``.

    Returns:
        A pair ``(text, image_inputs)``. With no images, ``text`` is
        returned untouched together with an empty dict. Otherwise the dict
        holds ``pixel_values_flat`` and ``image_num_patches`` and the text
        has been processed by ``_replace_image_tokens``.
    """
    if len(images) == 0:
        return text, {}

    per_image = self._images_to_pixel_values_lst(
        images,
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
    )
    flat_pixels = torch.cat(per_image)
    patch_counts = torch.tensor([len(item) for item in per_image])
    image_inputs = {
        "pixel_values_flat": flat_pixels,
        "image_num_patches": patch_counts,
    }

    text = self._replace_image_tokens(text, per_image)
    return text, image_inputs
def get_image_repl(
    self,
    feature_size: int,
    num_patches: int | None,
) -> PromptUpdateDetails[str]:
    """Build the replacement text for one image placeholder.

    ``self.IMG_CONTEXT`` is repeated ``feature_size`` times and wrapped
    between ``self.IMG_START`` and ``self.IMG_END``. ``num_patches`` is
    accepted but not read by this implementation.
    """
    context_run = self.IMG_CONTEXT * feature_size
    wrapped = self.IMG_START + context_run + self.IMG_END
    # The context token is handed over as the text to select within the
    # full replacement string.
    return PromptUpdateDetails.select_text(wrapped, self.IMG_CONTEXT)
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Build the transform pipeline with SigLIP normalization appended.

    The result is a ``T.Compose`` of ``build_transform(input_size)``
    followed by ``T.Normalize`` using ``SIGLIP_MEAN`` and ``SIGLIP_STD``.
    """
    base_transform = build_transform(input_size=input_size)
    siglip_normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([base_transform, siglip_normalize])
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
class LlamaNemotronVLEmbedProcessor(InternVLProcessor):
"""
Processor for LlamaNemotronVL embedding model.
......@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
- Uses different image context token (<IMG_CONTEXT> vs <image>)
"""
IMG_CONTEXT = "<IMG_CONTEXT>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
processor_config: dict,
image_processor: LlamaNemotronVLEmbedImageProcessor,
tokenizer: HfTokenizer,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
image_seq_length: int,
start_image_token: str = "<img>",
end_image_token: str = "</img>",
ctx_image_token: str = "<IMG_CONTEXT>",
) -> None:
if min_dynamic_patch is None:
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
if max_dynamic_patch is None:
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
if dynamic_image_size is None:
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
super().__init__(
config=config,
image_processor=image_processor,
tokenizer=tokenizer,
image_processor=None,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
image_seq_length=image_seq_length,
start_image_token=start_image_token,
end_image_token=end_image_token,
ctx_image_token=ctx_image_token,
)
def _get_transform(self) -> T.Compose:
"""Override to add SigLIP normalization."""
return build_siglip_transform(input_size=self.image_size)
self.image_processor: LlamaNemotronVLEmbedImageProcessor
def _replace_image_tokens(
def get_num_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Override with simpler token replacement for embedding model.
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
not <image>, so there's no collision risk.
"""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=image_processor.image_size,
target_ratios=target_ratios,
use_thumbnail=image_processor.use_thumbnail,
)
return num_patches * self.image_seq_length
......@@ -8,37 +8,54 @@
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers.hf import HfTokenizer
from .internvl import BaseInternVLProcessor
from .internvl import InternVLImageProcessor, InternVLProcessor
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_PAD]
class NVLMProcessor(InternVLProcessor):
def __init__(
self,
image_processor: InternVLImageProcessor,
tokenizer: HfTokenizer,
*,
image_seq_length: int,
start_image_token: str = "<Image>",
end_image_token: str = "</Image>",
ctx_image_token: str = "<|vision_pad|>",
) -> None:
super().__init__(
image_processor=image_processor,
tokenizer=tokenizer,
image_seq_length=image_seq_length,
start_image_token=start_image_token,
end_image_token=end_image_token,
ctx_image_token=ctx_image_token,
)
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
num_features: int | None = None,
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
num_features = num_patches * self.image_seq_length
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
if self.use_thumbnail:
if self.image_processor.use_thumbnail:
tile_pos_identifiers += ["<tile_global_thumbnail>"]
context_size = feature_size // num_patches
context_size = num_features // num_patches
features = "".join(
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
(identifier + self.ctx_image_token * context_size)
for identifier in tile_pos_identifiers
)
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
repl = "<Image>" + features + "</Image>"
repl = self.start_image_token + features + self.end_image_token
return PromptUpdateDetails.select_text(repl, IMG_PAD)
return PromptUpdateDetails.select_text(repl, self.ctx_image_token)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
# Marker tokens placed before and after the run of image placeholder tokens
# that SkyworkR1VProcessor.get_image_repl builds for each image.
IMG_START = "<img>"
IMG_END = "</img>"
# Placeholder token; get_image_repl repeats it `feature_size` times and
# image_token_id looks up its id in the tokenizer vocabulary.
IMG_CONTEXT = "<IMG_CONTEXT>"

# Per-channel mean/std passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Create the preprocessing pipeline applied to every image tile.

    The pipeline converts the image to RGB, resizes it to a square of
    ``input_size`` pixels per side with bicubic interpolation, turns it into
    a tensor and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    resize = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    normalize = T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    return T.Compose([to_rgb, resize, T.ToTensor(), normalize])
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Return the effective ``(min, max)`` number of tiles per image.

    Both bounds collapse to 1 when dynamic image sizing is disabled. When a
    thumbnail is used and more than one tile is allowed, the upper bound is
    raised by one.
    """
    if not dynamic_image_size:
        min_dynamic_patch = max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1

    return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate tiling grids whose tile count lies in ``[min_num, max_num]``.

    Returns the unique pairs, ordered by ascending tile count (product of
    the two factors).
    """
    grids: set[tuple[int, int]] = set()
    # Insert in the same nested order as a set comprehension would, so the
    # relative order of equal-area grids after the stable sort is unchanged.
    for limit in range(min_num, max_num + 1):
        for first in range(1, limit + 1):
            for second in range(1, limit + 1):
                if min_num <= first * second <= max_num:
                    grids.add((first, second))

    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the tile count and resize target for an image.

    Returns ``(blocks, target_width, target_height)`` where the target size
    is the chosen grid scaled by ``image_size`` and ``blocks`` is the number
    of grid cells, plus one for the thumbnail when ``use_thumbnail`` is set
    and the grid has more than one cell.
    """
    # Choose the grid that best matches the original aspect ratio.
    grid_cols, grid_rows = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    num_blocks = grid_cols * grid_rows
    # A thumbnail tile is only appended when the image is actually split.
    if use_thumbnail and num_blocks != 1:
        num_blocks += 1

    return num_blocks, image_size * grid_cols, image_size * grid_rows
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into square tiles of ``image_size`` pixels.

    The image is resized to the target grid size and cropped tile by tile in
    row-major order. When ``use_thumbnail`` is set and more than one tile
    was produced, a resized copy of the whole image is appended last.
    """
    src_width, src_height = image.size

    # The thumbnail is handled below, so count grid cells only.
    blocks, target_width, target_height = calculate_skyworkr1v_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))

    tiles = []
    for index in range(blocks):
        row, col = divmod(index, target_width // image_size)
        left = col * image_size
        top = row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and return the transformed tiles stacked in one tensor.

    The candidate grids come from ``min_num`` / ``max_num``; each resulting
    tile is passed through ``build_transform(input_size)`` before stacking.
    """
    ratios = get_skyworkr1v_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    Processor for SkyworkR1V, implemented here because the model does not
    define its own HF processor.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store the tokenizer/config and resolve the tiling settings.

        Any of the keyword overrides left as ``None`` falls back to the
        attribute of the same name on ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_image_size: int = config.vision_config.image_size
        vision_patch_size: int = config.vision_config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Number of placeholder tokens contributed by one tile.
        patches_per_side = vision_image_size // vision_patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = vision_image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the ``IMG_CONTEXT`` placeholder token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        The text is ``feature_size`` copies of ``IMG_CONTEXT`` wrapped in
        ``IMG_START`` / ``IMG_END``; ``IMG_CONTEXT`` is the selected text.
        ``num_patches`` is not used by this implementation.
        """
        wrapped = f"{IMG_START}{IMG_CONTEXT * feature_size}{IMG_END}"
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve the tile-count bounds, defaulting to instance settings."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tiling grids for the resolved bounds."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_skyworkr1v_target_ratios(*bounds)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of placeholder tokens for an image of this size."""
        ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        tile_count, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return tile_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its stacked tile tensor."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        pixel_values_lst = []
        for img in images:
            pixel_values_lst.append(
                image_to_pixel_values_skyworkr1v(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return pixel_values_lst

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` after expanding one ``<image>`` tag per image.

        Returns a ``BatchFeature`` holding the tokenizer outputs and, when
        images are given, ``pixel_values_flat`` and ``image_num_patches``.
        """
        # Normalize both inputs to lists.
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]

        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        image_inputs = {}
        if images:
            tiles_per_image = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(tiles_per_image),
                "image_num_patches": torch.tensor(
                    [len(tiles) for tiles in tiles_per_image]
                ),
            }

            # Each image consumes the first remaining "<image>" tag in
            # every prompt.
            for tiles in tiles_per_image:
                tile_count = tiles.shape[0]
                replacement = self.get_image_repl(
                    tile_count * self.num_image_token, tile_count
                )
                text = [
                    prompt.replace("<image>", replacement.full, 1)
                    for prompt in text
                ]

        text_inputs = self.tokenizer(text)

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment