Unverified Commit f3403243 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[1/2] Move InternVL-based processors (#37260)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent 2660b928
...@@ -23,7 +23,7 @@ def _get_expected_num_patches( ...@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int, min_num: int,
max_num: int, max_num: int,
): ):
from vllm.model_executor.models.h2ovl import ( from vllm.transformers_utils.processors.h2ovl import (
calculate_h2ovl_targets, calculate_h2ovl_targets,
get_h2ovl_target_ratios, get_h2ovl_target_ratios,
) )
......
...@@ -23,7 +23,7 @@ def _get_expected_num_patches( ...@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int, min_num: int,
max_num: int, max_num: int,
): ):
from vllm.model_executor.models.internvl import ( from vllm.transformers_utils.processors.internvl import (
calculate_internvl_targets, calculate_internvl_targets,
get_internvl_target_ratios, get_internvl_target_ratios,
) )
......
...@@ -23,7 +23,7 @@ def _get_expected_num_patches( ...@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int, min_num: int,
max_num: int, max_num: int,
): ):
from vllm.model_executor.models.nemotron_vl import ( from vllm.transformers_utils.processors.nemotron_vl import (
calculate_nemotron_vl_targets, calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios, get_nemotron_vl_target_ratios,
) )
......
...@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import ( from .interfaces import (
...@@ -27,13 +26,9 @@ from .interfaces import ( ...@@ -27,13 +26,9 @@ from .interfaces import (
SupportsPP, SupportsPP,
) )
from .internvl import ( from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLDummyInputsBuilder, BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor, BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo, BaseInternVLProcessingInfo,
BaseInternVLProcessor,
) )
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
...@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = ( ...@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
) )
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
"""
Custom processor for Eagle2.5-VL model.
Extends BaseInternVLProcessor with Eagle-specific token handling.
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
# Skip super().__init__() to avoid config manipulation
# Directly initialize all required attributes
self.config = config
self.tokenizer = tokenizer
# Image size with force_image_size override
image_size: int = config.vision_config.image_size
if hasattr(config, "force_image_size") and config.force_image_size:
image_size = config.force_image_size
patch_size: int = config.vision_config.patch_size
downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
# Compute num_image_token
self.num_image_token = int(
(image_size // patch_size) ** 2 * (downsample_ratio**2)
)
self.image_size = image_size
# Dynamic patch settings with defaults
self.min_dynamic_patch = (
min_dynamic_patch
if min_dynamic_patch is not None
else getattr(config, "min_dynamic_patch", 1)
)
self.max_dynamic_patch = (
max_dynamic_patch
if max_dynamic_patch is not None
else getattr(config, "max_dynamic_patch", 12)
)
self.dynamic_image_size = (
dynamic_image_size
if dynamic_image_size is not None
else getattr(config, "dynamic_image_size", True)
)
self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
@property
def image_token_id(self) -> int:
"""Get the image token ID from config or tokenizer."""
if hasattr(self.config, "image_token_index"):
return self.config.image_token_index
# Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
vocab = self.tokenizer.get_vocab()
if IMG_CONTEXT in vocab:
return vocab[IMG_CONTEXT]
raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
"""Get image replacement string for prompt."""
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo): class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model.""" """Processing info for Eagle2.5-VL model."""
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
import torch import torch
from PIL import Image
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
...@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import ( ...@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
ProcessorInputs, ProcessorInputs,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
PromptUpdateDetails,
TimingContext, TimingContext,
) )
from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
from .intern_vit import InternVisionModel from .intern_vit import InternVisionModel
from .internvl import ( from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLDummyInputsBuilder, BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor, BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo, BaseInternVLProcessingInfo,
BaseInternVLProcessor,
InternVLChatModel, InternVLChatModel,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
) )
def resolve_h2ovl_min_max_num(
*,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
max_dynamic_patch += 1
return min_dynamic_patch, max_dynamic_patch
def get_h2ovl_target_ratios(
min_num: int,
max_num: int,
*,
prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
target_ratios = get_internvl_target_ratios(min_num, max_num)
# if prior_aspect_ratio is provided, filter the target ratios
if prior_aspect_ratio is not None:
target_ratios = [
ratio
for ratio in target_ratios
if prior_aspect_ratio[0] % ratio[0] != 0
and prior_aspect_ratio[1] % ratio[1] != 0
]
return target_ratios
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height, target_aspect_ratio
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
(
blocks,
target_width,
target_height,
target_aspect_ratio,
) = calculate_h2ovl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
def _preprocess_image(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
target_ratios = get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=prior_aspect_ratio,
)
transform = build_transform(input_size=input_size)
images, target_aspect_ratio = dynamic_preprocess_h2ovl(
image,
image_size=input_size,
use_thumbnail=use_thumbnail,
target_ratios=target_ratios,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values, target_aspect_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
use_msac: bool,
) -> torch.Tensor:
# when MSAC is turned on, we need to process the image twice
if use_msac:
# first pass
pixel_values1, aspect_ratio1 = _preprocess_image(
image,
input_size=input_size,
min_num=1,
max_num=max_num,
use_thumbnail=True,
prior_aspect_ratio=None,
)
# second pass
pixel_values2, _ = _preprocess_image(
image,
input_size=input_size,
min_num=3,
max_num=max_num,
use_thumbnail=True,
prior_aspect_ratio=aspect_ratio1,
)
# combine pixel values
pixel_values = torch.cat(
[pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0
)
else:
pixel_values, _ = _preprocess_image(
image,
input_size=input_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=use_thumbnail,
prior_aspect_ratio=None,
)
return pixel_values
class H2OVLProcessor(BaseInternVLProcessor):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_msac: bool | None = None,
) -> None:
super().__init__(
config,
tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
if use_msac is None:
use_msac = config.use_msac
assert isinstance(use_msac, bool)
self.use_msac = use_msac
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_h2ovl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
prior_aspect_ratio: tuple[int, int] | None = None,
override_min_num: int | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
if override_min_num is not None:
min_num = override_min_num
return get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=prior_aspect_ratio,
)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
use_msac: bool | None = None,
) -> int:
use_msac = self.use_msac if use_msac is None else use_msac
use_thumbnail = self.use_thumbnail
if use_msac:
target_ratios_1 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
override_min_num=1,
)
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios_1,
use_thumbnail=True,
)
target_ratios_2 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
prior_aspect_ratio=aspect_ratio_1,
override_min_num=3,
)
num_patches_2, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios_2,
use_thumbnail=True,
)
num_patches = num_patches_1 + num_patches_2 - 1
else:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
use_msac = self.use_msac if len(images) == 1 else False
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_h2ovl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
use_msac=use_msac,
)
for image in images
]
class H2OVLProcessingInfo(BaseInternVLProcessingInfo): class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
return self.ctx.init_processor( return self.ctx.init_processor(
......
...@@ -7,16 +7,13 @@ ...@@ -7,16 +7,13 @@
# Copyright (c) 2023 OpenGVLab # Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
from abc import ABC, abstractmethod from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Any, Literal, TypeAlias, TypeVar from typing import Annotated, Literal, TypeAlias, TypeVar
import numpy.typing as npt
import torch import torch
import torch.nn as nn import torch.nn as nn
import torchvision.transforms as T from transformers import BatchFeature, PretrainedConfig
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
...@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import ( ...@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
) )
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict, MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
...@@ -46,10 +42,12 @@ from vllm.multimodal.processing import ( ...@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
BaseProcessingInfo, BaseProcessingInfo,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
PromptUpdateDetails,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processors.internvl import (
BaseInternVLProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import ( from .interfaces import (
...@@ -60,13 +58,6 @@ from .interfaces import ( ...@@ -60,13 +58,6 @@ from .interfaces import (
) )
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class InternVLImagePixelInputs(TensorSchema): class InternVLImagePixelInputs(TensorSchema):
""" """
...@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema): ...@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def resolve_internvl_min_max_num(
*,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
max_dynamic_patch += 1
return min_dynamic_patch, max_dynamic_patch
def get_internvl_target_ratios(
min_num: int,
max_num: int,
) -> list[tuple[int, int]]:
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if min_num <= i * j <= max_num
}
return sorted(target_ratios, key=lambda x: x[0] * x[1])
def calculate_internvl_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> list[Image.Image]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
blocks, target_width, target_height = calculate_internvl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
) -> torch.Tensor:
target_ratios = get_internvl_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_internvl(
image,
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
video: npt.NDArray,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
) -> torch.Tensor:
target_ratios = get_internvl_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
frames_list = list[Image.Image]()
for frame in video:
pil_frame = dynamic_preprocess_internvl(
Image.fromarray(frame, mode="RGB"),
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
assert len(pil_frame) == 1
frames_list.extend(pil_frame)
pixel_values = torch.stack([transform(image) for image in frames_list])
return pixel_values
class BaseInternVLProcessor(ABC):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@property
@abstractmethod
def image_token_id(self) -> int:
raise NotImplementedError
@abstractmethod
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
raise NotImplementedError
def resolve_min_max_num(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_internvl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
return get_internvl_target_ratios(min_num, max_num)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_internvl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
)
for image in images
]
def _preprocess_image(
self,
text: list[str],
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text, image_inputs
def _make_batch_input(self, input_item: Any | list[Any] | None = None):
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
text, images = [self._make_batch_input(x) for x in (text, images)]
text, image_inputs = self._preprocess_image(
text=text,
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
"""
HF Processor for InternVLChatModel with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
video_token: str | None = None,
) -> None:
super().__init__(
config=config,
tokenizer=tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
# add extra video token for video processing
self.video_token = video_token
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
@property
def video_token_id(self) -> int | None:
if self.video_token is None:
return None
return self.tokenizer.get_vocab().get(self.video_token, None)
@property
def supports_video(self) -> bool:
return self.video_token_id is not None
def _videos_to_pixel_values_lst(
self,
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=1,
max_dynamic_patch=1,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
video_to_pixel_values_internvl(
video,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=False,
)
for video in videos
]
def _preprocess_video(
self,
text: list[str],
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
):
if len(videos) == 0 or not self.supports_video:
video_inputs = {}
else:
pixel_values_lst_video = self._videos_to_pixel_values_lst(
videos,
dynamic_image_size=dynamic_image_size,
)
video_inputs = {
"pixel_values_flat_video": torch.cat(pixel_values_lst_video),
"video_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst_video]
),
}
for pixel_values in pixel_values_lst_video:
num_patches = pixel_values.shape[0]
video_repl = self.get_video_repl(
self.num_image_token, num_patches, self.video_token
)
text = [t.replace("<video>", video_repl.full, 1) for t in text]
return text, video_inputs
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
videos: npt.NDArray | list[npt.NDArray] | None = None,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
text, images, videos = [
self._make_batch_input(x) for x in (text, images, videos)
]
text, image_inputs = self._preprocess_image(
text=text,
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
text, video_inputs = self._preprocess_video(
text=text,
videos=videos,
dynamic_image_size=dynamic_image_size,
)
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def get_video_repl(
self,
feature_size: int,
num_patches: int | None = None,
video_context_token: str = IMG_CONTEXT,
) -> PromptUpdateDetails[str]:
repl_features = video_context_token * self.num_image_token
repl_features_with_sep = IMG_START + repl_features + IMG_END
# num_patches is equal to num_frames
repl_full = "".join(
[f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
)
return PromptUpdateDetails.select_text(repl_full, video_context_token)
class BaseInternVLProcessingInfo(BaseProcessingInfo): class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models.""" """Basic image-only ProcessingInfo for InternVL-style models."""
......
...@@ -8,22 +8,15 @@ ...@@ -8,22 +8,15 @@
# -------------------------------------------------------- # --------------------------------------------------------
import copy import copy
import math
import warnings import warnings
from abc import ABC, abstractmethod from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from dataclasses import dataclass
from functools import cached_property from functools import cached_property
from typing import Annotated, Any, Literal, TypeAlias, TypeVar from typing import Annotated, Literal, TypeAlias, TypeVar
import einops
import numpy as np
import numpy.typing as npt
import regex as re
import torch import torch
import torch.nn as nn import torch.nn as nn
from PIL import Image from transformers import BatchFeature
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
...@@ -38,10 +31,6 @@ from vllm.model_executor.models.interfaces import ( ...@@ -38,10 +31,6 @@ from vllm.model_executor.models.interfaces import (
SupportsMultiModal, SupportsMultiModal,
SupportsMultiModalPruning, SupportsMultiModalPruning,
) )
from vllm.model_executor.models.internvl import (
calculate_internvl_targets,
get_internvl_target_ratios,
)
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM
from vllm.model_executor.models.parakeet import ParakeetExtractor, ProjectedParakeet from vllm.model_executor.models.parakeet import ParakeetExtractor, ProjectedParakeet
...@@ -83,23 +72,28 @@ from vllm.multimodal.processing.processor import ( ...@@ -83,23 +72,28 @@ from vllm.multimodal.processing.processor import (
BaseProcessingInfo, BaseProcessingInfo,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
PromptUpdateDetails,
_seq2tokens,
) )
from vllm.renderers import TokenizeParams from vllm.renderers import TokenizeParams
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.tokenizers import cached_tokenizer_from_config
from vllm.transformers_utils.configs.radio import RadioConfig from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.processors.nano_nemotron_vl import (
AUDIO_CONTEXT,
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseNanoNemotronVLProcessor,
DynamicResolutionImageTiler,
NanoNemotronVLProcessor,
get_internvl_target_ratios,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .utils import _merge_multimodal_embeddings from .utils import _merge_multimodal_embeddings
logger = init_logger(__name__) logger = init_logger(__name__)
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images MAX_AUDIO_LEN_S = 10 * 60 # 10 minutes
Image.MAX_IMAGE_PIXELS = None # Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels
class NanoNemotronVLAudioFeatureInputs(TensorSchema): class NanoNemotronVLAudioFeatureInputs(TensorSchema):
...@@ -116,20 +110,6 @@ class NanoNemotronVLAudioFeatureInputs(TensorSchema): ...@@ -116,20 +110,6 @@ class NanoNemotronVLAudioFeatureInputs(TensorSchema):
audio_feature_lengths: Annotated[torch.Tensor, TensorShape("b")] audio_feature_lengths: Annotated[torch.Tensor, TensorShape("b")]
MAX_AUDIO_LEN_S = 10 * 60 # 10 minutes
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
AUDIO_START = "<so_start>"
AUDIO_END = "<so_end>"
AUDIO_CONTEXT = "<so_embedding>"
# Profiling
# MAX_FRAMES = 16
DEFAULT_NUM_TILES = 12
class NanoNemotronVLImagePixelInputs(TensorSchema): class NanoNemotronVLImagePixelInputs(TensorSchema):
""" """
Dimensions: Dimensions:
...@@ -213,987 +193,6 @@ NanoNemotronVLVideoInputs: TypeAlias = ( ...@@ -213,987 +193,6 @@ NanoNemotronVLVideoInputs: TypeAlias = (
) )
def dynamic_preprocess(
image,
*,
image_size=512,
max_num_tiles=12,
use_thumbnail=True,
idx=0,
):
orig_width, orig_height = image.size
target_ratios = get_internvl_target_ratios(1, max_num_tiles)
blocks, target_width, target_height = calculate_internvl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
image = np.asarray(
image.convert("RGB") if image.mode != "RGB" else image, dtype=np.uint8
)
image = torch.from_numpy(image).unsqueeze(0) # (1, H, W, 3)
image = image.permute(0, 3, 1, 2) # (1, 3, H, W)
resized_img = torch.nn.functional.interpolate(
image,
size=(target_height, target_width),
mode="bicubic",
align_corners=False,
antialias=True,
)
B, C, H, W = resized_img.shape
hp, wp = H // image_size, W // image_size
patches = (
resized_img.reshape(B, C, hp, image_size, wp, image_size)
.permute(0, 2, 4, 1, 3, 5)
.reshape(B * hp * wp, C, image_size, image_size)
/ 255.0
)
if use_thumbnail and patches.shape[0] > 1:
thumb = (
torch.nn.functional.interpolate(
image,
size=(image_size, image_size),
mode="bicubic",
align_corners=False,
antialias=True,
)
/ 255.0
)
patches = torch.cat([patches, thumb], dim=0)
return list(patches)
def image_to_pixel_values(
image: Image.Image,
*,
input_size: int,
max_num: int,
use_thumbnail: bool,
idx: int,
) -> torch.Tensor:
images = dynamic_preprocess(
image,
image_size=input_size,
max_num_tiles=max_num,
use_thumbnail=use_thumbnail,
idx=idx,
)
pixel_values = torch.stack(images)
return pixel_values
def video_to_pixel_values(
video: npt.NDArray,
*,
input_size: int,
max_num_tiles: int = 1,
use_thumbnail: bool,
) -> torch.Tensor:
assert max_num_tiles == 1, "Video modality always uses one tile"
# (num_frames, H, W, C) -> (num_frames, C, H, W)
video_tensor = torch.from_numpy(video).permute(0, 3, 1, 2)
if video_tensor.shape[2] != input_size or video_tensor.shape[3] != input_size:
video_tensor = torch.nn.functional.interpolate(
video_tensor,
size=(input_size, input_size),
mode="bicubic",
align_corners=False,
antialias=True,
)
video_tensor = video_tensor / 255.0
return video_tensor
def input_conditioner(x, norm_mean, norm_std):
return (x - norm_mean) / norm_std
def calculate_timestamps(
indices: list[int] | torch.Tensor,
frame_duration_ms: int,
):
if not isinstance(indices, list):
indices = indices.tolist()
timestamps = [int(i) * frame_duration_ms / 1000.0 for i in indices]
return timestamps
class DynamicResolutionImageTiler:
CONV_MERGING = False
PIXEL_SHUFFLE = True
USE_THUMBNAIL = False
def __init__(
self,
*,
max_model_len: int,
patch_size: int,
min_num_patches: int,
max_num_patches: int,
downsample_ratio: int,
norm_mean: Sequence[float],
norm_std: Sequence[float],
factor_max: float = 1.0,
use_thumbnail: bool = False,
) -> None:
assert use_thumbnail is False, "use_thumbnail is not supported"
self._patch_size: int = patch_size
self._max_model_len = max_model_len
self._min_num_patches = min_num_patches
self._max_num_patches = max_num_patches if max_num_patches > 0 else float("inf")
self._factor_max = factor_max
self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1)
self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1)
assert downsample_ratio < 1
reduction_factor = 1 / downsample_ratio
assert reduction_factor == 2.0
self._downsample_ratio = int(reduction_factor) ** (
self.PIXEL_SHUFFLE + self.CONV_MERGING
)
assert self._downsample_ratio == 2
def _get_num_embeddings(self, width: int, height: int) -> int:
num_patches = (width // self._patch_size) * (height // self._patch_size)
num_tokens = num_patches // (self._downsample_ratio**2)
return num_tokens
def width_and_height_for_max_num_tokens_available(
self,
target_num_tokens_post_shuffle: int,
) -> tuple[int, int]:
"""
TODO: optimize this so it squeezes closer to target number of tokens.
Calculate image dimensions that produce approximately `target` tokens after
pixel_shuffle.
With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
need 4*B patches to get B tokens.
Examples:
>>> PATCH_SIZE = 16
>>> DOWNSAMPLE_RATIO = 0.5
>>> tiler = DynamicResolutionImageTiler(
... max_model_len=16384,
... patch_size=PATCH_SIZE,
... downsample_ratio=DOWNSAMPLE_RATIO,
... min_num_patches=4,
... max_num_patches=0,
... )
>>> width, height = tiler.width_and_height_for_max_num_tokens_available(
... target_num_tokens_post_shuffle=8192,
... )
>>> assert width, height == (2880, 2880)
>>> assert (width // PATCH_SIZE) * (
... height // PATCH_SIZE
... ) // 2**2 == 8100 # tokens post-shuffle
>>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
"""
side_pixels = (
math.isqrt(target_num_tokens_post_shuffle)
* self._downsample_ratio
* self._patch_size
)
assert isinstance(side_pixels, int) and side_pixels % self._patch_size == 0
return side_pixels, side_pixels
def max_num_tokens_available(self, text_prompt_length: int) -> int:
return self._max_model_len - text_prompt_length - 4
def _images_to_pixel_values_lst(
self,
text_prompt_length: int,
images: list[Image.Image],
) -> tuple[list[torch.Tensor], list[int]]:
num_tokens_available = self.max_num_tokens_available(text_prompt_length)
params_per_image = self.compute_params(images, num_tokens_available)
feature_sizes = []
images = []
for param in params_per_image:
for t in self.apply_params(param):
assert t.ndim == 3, f"{t.ndim=}: expected 3 dim tensor"
images.append(t)
feature_sizes.append(param.num_embeddings)
return images, feature_sizes
feature_size_cache: dict[Image.Image, int] = {}
@classmethod
def get_cached_feature_size(cls, image: Image.Image) -> int:
feature_size = cls.feature_size_cache[id(image)]
# hard assert that we only use the feature size once
del cls.feature_size_cache[id(image)]
return feature_size
@dataclass
class DynamicResolutionParams:
media: Image.Image
num_tiles: int
num_embeddings: int
patch_size: tuple[int, int]
def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
target_size = (
params.patch_size[1] * self._patch_size,
params.patch_size[0] * self._patch_size,
)
image = np.asarray(
params.media.convert("RGB") if params.media.mode != "RGB" else params.media,
dtype=np.uint8,
)
resized_img = (
torch.nn.functional.interpolate(
torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
size=target_size,
mode="bicubic",
align_corners=False,
antialias=True,
)
/ 255.0
)
return list(resized_img)
def process_media(
self,
media: Image.Image,
num_tokens_available: int,
) -> tuple[DynamicResolutionParams, int]:
"""Process a single media item and return its parameters.
Args:
media: The media item to process
num_tokens_available: Number of tokens available for this media
Returns:
DynamicResolutionParams for the media
"""
current_num_tokens_available = num_tokens_available
assert isinstance(media, Image.Image), (
"Dynamic resolution is only supported for image media"
)
orig_width, orig_height = media.width, media.height
closest_patch_height = round(orig_height / self._patch_size + 0.5)
closest_patch_width = round(orig_width / self._patch_size + 0.5)
patches = closest_patch_height * closest_patch_width
factor = min(
math.sqrt(current_num_tokens_available / patches), self._factor_max
)
target_patch_height = math.floor(factor * closest_patch_height)
target_patch_width = math.floor(factor * closest_patch_width)
# Consider self._min_num_patches if > current_num_tokens_available.
if (
current_num_tokens_available > self._min_num_patches
and target_patch_height * target_patch_width < self._min_num_patches
):
up_factor = math.sqrt(
self._min_num_patches / (target_patch_height * target_patch_width)
)
target_patch_height = math.ceil(up_factor * target_patch_height)
target_patch_width = math.ceil(up_factor * target_patch_width)
# Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
# or by 4 when BOTH are enabled (two successive 2x reductions)
if self.PIXEL_SHUFFLE or self.CONV_MERGING:
required_divisor = 4 if (self.PIXEL_SHUFFLE and self.CONV_MERGING) else 2
rem_h = target_patch_height % required_divisor
if rem_h != 0:
inc_h = required_divisor - rem_h
if (
target_patch_height + inc_h
) * target_patch_width <= current_num_tokens_available:
target_patch_height += inc_h
else:
target_patch_height = max(
required_divisor, target_patch_height - rem_h
)
rem_w = target_patch_width % required_divisor
if rem_w != 0:
inc_w = required_divisor - rem_w
if (
target_patch_height * (target_patch_width + inc_w)
<= current_num_tokens_available
):
target_patch_width += inc_w
else:
target_patch_width = max(
required_divisor, target_patch_width - rem_w
)
# Calculate embeddings for the main dynamic resolution image
num_embeddings = self._get_num_embeddings(
target_patch_width * self._patch_size,
target_patch_height * self._patch_size,
)
token_count = target_patch_width * target_patch_height
# Add thumbnail embeddings if enabled and image area is below threshold
num_tiles = 1 # Base dynamic resolution image
return self.DynamicResolutionParams(
media=media,
num_tiles=num_tiles,
num_embeddings=num_embeddings,
patch_size=(target_patch_width, target_patch_height),
), token_count
def compute_params(
self,
media_list: list[Image.Image],
num_tokens_available: int | None = None,
) -> list[DynamicResolutionParams]:
"""Compute parameters for all media with iterative token budgeting.
Args:
media_list: List of media items to process
num_tokens_available: Total number of tokens available across all media
Returns:
List of ImageTilingParams for each media item
"""
num_tokens_available = (
num_tokens_available
* (4 if self.PIXEL_SHUFFLE else 1)
* (4 if self.CONV_MERGING else 1)
)
# When the number of available token is too small,
# allow self._min_num_patches per media and let the sample be truncated.
num_tokens_available = max(
num_tokens_available, self._min_num_patches * len(media_list)
)
# Clip the number of tokens available per media to >min and <max patches.
num_tokens_available_per_media = [
max(min(num_tokens_available, self._max_num_patches), self._min_num_patches)
for _ in range(len(media_list))
]
# prevent infinite loop in any case
for _ in range(10):
# Step 1: Process each media with current token budget
params = []
token_counts = []
for media, tokens_for_media in zip(
media_list, num_tokens_available_per_media
):
param, token_count = self.process_media(media, tokens_for_media)
params.append(param)
token_counts.append(token_count)
self.feature_size_cache[id(param.media)] = param.num_embeddings
# Step 2: Check if total tokens is within budget
total_tokens = sum(token_counts)
if total_tokens <= num_tokens_available:
# We're within budget, return the params
return params
# Step 3: We're over budget, need to scale down
# Calculate scaling factor to get under budget
scaling_factor = num_tokens_available / total_tokens
# Recalculate token budgets for each media based on scaling
# Each media gets a proportional share of the total budget
scaled_down_num_tokens_available_per_media = [
max(self._min_num_patches, int(token_count * scaling_factor))
for token_count in token_counts
]
scaled_down = any(
[
scaled_down_num_tokens_available_per_media[i]
< num_tokens_available_per_media[i]
for i in range(len(num_tokens_available_per_media))
]
)
# If there wasn't scaling down, we're stuck with min_num_patches per media,
# else try with the scaled down num_tokens_available_per_media.
if not scaled_down:
num_tokens_available_per_media = [self._min_num_patches] * len(
media_list
)
else:
num_tokens_available_per_media = (
scaled_down_num_tokens_available_per_media
)
ctx = f"{params=} {total_tokens=} {num_tokens_available=}"
raise ValueError(
f"Should be unreachable - `return params` above must be reached: {ctx}"
)
@staticmethod
def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
assert len(images) > 0, "No images to stack"
def rearrange_img(x):
py = x.shape[-2] // patch_size
px = x.shape[-1] // patch_size
x = einops.rearrange(
x,
"c (py yy) (px xx) -> (py px) (c yy xx)",
py=py,
yy=patch_size,
px=px,
xx=patch_size,
)
return x
imgs = [rearrange_img(img) for img in images]
pixel_values_flat = torch.cat(imgs, dim=0).unsqueeze(0)
return pixel_values_flat
class BaseNanoNemotronVLProcessor(ABC):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*args,
max_model_len: int,
max_num_tiles: int | None = None,
**kwargs,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
image_size: int = config.force_image_size
patch_size: int = config.patch_size
downsample_ratio: int = config.downsample_ratio
self.num_image_token = int(
(image_size // patch_size) ** 2 * (downsample_ratio**2)
)
self.image_size = image_size
self.use_thumbnail: bool = config.use_thumbnail
self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)
self.dynamic_tiler: DynamicResolutionImageTiler | None = None
if self.use_dynamic_resolution(config):
self.dynamic_tiler = DynamicResolutionImageTiler(
max_model_len=max_model_len,
patch_size=patch_size,
downsample_ratio=downsample_ratio,
min_num_patches=config.vision_config.args["min_num_patches"],
max_num_patches=config.vision_config.args["max_num_patches"],
norm_mean=config.norm_mean,
norm_std=config.norm_std,
)
@staticmethod
def use_dynamic_resolution(config: PretrainedConfig) -> bool:
return "min_num_patches" in config.vision_config.args
@property
@abstractmethod
def image_token_id(self) -> int:
raise NotImplementedError
@abstractmethod
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
raise NotImplementedError
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
max_num_tiles: int,
) -> int:
target_ratios = get_internvl_target_ratios(1, max_num_tiles)
num_patches, _, _ = calculate_internvl_targets(
orig_width=image_width,
orig_height=image_height,
target_ratios=target_ratios,
image_size=self.image_size,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
max_num_tiles: int,
) -> list[torch.Tensor]:
return [
image_to_pixel_values(
image,
input_size=self.image_size,
max_num=max_num_tiles,
use_thumbnail=self.use_thumbnail,
idx=idx,
)
for idx, image in enumerate(images)
]
def _preprocess_image(
self,
text: list[str],
images: list[Image.Image],
max_num_tiles: int,
) -> tuple[list[str], dict[str, Any]]:
if len(images) == 0:
image_inputs = {}
return text, image_inputs
if tiler := self.dynamic_tiler:
sans_images = text[0].replace("<image>", "")
text_prompt_length = len(
self.tokenizer(sans_images, add_special_tokens=False).input_ids
)
pixel_values_lst, num_tokens_per_image = tiler._images_to_pixel_values_lst(
text_prompt_length=text_prompt_length,
images=images,
)
imgs_sizes = [(pv.shape[-2], pv.shape[-1]) for pv in pixel_values_lst]
normalized = [
input_conditioner(img, tiler.norm_mean, tiler.norm_std)
for img in pixel_values_lst
]
image_num_patches = torch.tensor([1] * len(num_tokens_per_image))
image_inputs = {
"pixel_values_flat": normalized,
"imgs_sizes": imgs_sizes,
"num_tokens_per_image": num_tokens_per_image,
}
else:
pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
image_num_patches = torch.tensor([len(item) for item in pixel_values_lst])
pixel_values_flat = input_conditioner(
torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
)
image_inputs = {
"pixel_values_flat": pixel_values_flat,
"image_num_patches": image_num_patches,
}
num_tokens_per_image = [
self.num_image_token * len(item) for item in pixel_values_lst
]
assert len(text) == 1, (
"hf_processor is called on the output of get_dummy_text, "
"which should be a single string"
)
parts = [x for x in re.split(r"(<image>)", text[0]) if x]
assert parts.count("<image>") == len(pixel_values_lst), (
"the number of <image> tokens in the text should be the "
"same as the number of images"
)
for i, (feature_size, num_patches) in enumerate(
zip(num_tokens_per_image, image_num_patches, strict=True)
):
image_repl = self.get_image_repl(feature_size, num_patches)
parts[i] = parts[i].replace("<image>", image_repl.full)
text = ["".join(parts)]
return text, image_inputs
def _make_batch_input(self, input_item: Any | list[Any] | None = None):
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
@abstractmethod
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
max_num_tiles: int | None = None,
) -> BatchFeature:
raise NotImplementedError
class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
"""
HF Processor with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
max_model_len: int,
max_num_tiles: int | None = None,
video_token: str | None = None,
video_pruning_rate: float | None = None,
) -> None:
super().__init__(
config=config,
tokenizer=tokenizer,
max_model_len=max_model_len,
max_num_tiles=max_num_tiles,
)
# add extra video token for video processing
self.video_token = video_token
self.video_pruning_rate = video_pruning_rate
self.audio_extractor: ParakeetExtractor | None = None
raw_sound_config = getattr(config, "sound_config", None)
if raw_sound_config is not None:
self.audio_extractor = ParakeetExtractor(raw_sound_config)
# Pre-tokenize special tokens for video processing
# to avoid repeated tokenization
self._img_start_token_ids = tokenizer.encode(
IMG_START, add_special_tokens=False
)
self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
self._img_context_token_ids = tokenizer.encode(
IMG_CONTEXT, add_special_tokens=False
)
@property
def supports_video(self) -> bool:
return self.video_token_id is not None
@property
def video_token_id(self) -> int | None:
if self.video_token is None:
return None
return self.tokenizer.get_vocab().get(self.video_token, None)
@property
def image_token_id(self) -> int:
return self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
def _videos_to_pixel_values_lst(
self,
videos: list[npt.NDArray],
max_num_tiles: int,
) -> list[torch.Tensor]:
return [
video_to_pixel_values(
video,
input_size=self.image_size,
max_num_tiles=max_num_tiles,
use_thumbnail=self.use_thumbnail,
)
for video in videos
]
def _preprocess_video(
self,
text: list[str],
videos: list[tuple[npt.NDArray, dict[str, Any]]],
max_num_tiles: int,
):
if len(videos) == 0 or not self.supports_video:
video_inputs = {}
else:
videos_lst = [v[0] for v in videos]
video_metadata_lst = [v[1] for v in videos]
pixel_values_lst_video = self._videos_to_pixel_values_lst(
videos_lst,
max_num_tiles=max_num_tiles,
)
# We use frame duration in milliseconds (as integer) to ensure
# we have consistent timestamps calculation. At preprocessing
# fps parameter is given in fp32, while at inference it is bf16
# which leads to inaccurate timestamp calculation and causes
# timestamp values to differ.In rare cases this causes
# mismatching number of output tokens for tokenized frame prefixes
frame_duration_ms_lst = [
int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
]
frames_indices_lst = [
metadata["frames_indices"] for metadata in video_metadata_lst
]
video_num_patches = torch.tensor(
[len(item) for item in pixel_values_lst_video]
)
video_inputs = {
"pixel_values_flat_video": input_conditioner(
torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
),
"video_num_patches": video_num_patches,
"frames_indices": frames_indices_lst,
"frame_duration_ms": torch.tensor(frame_duration_ms_lst),
}
image_size: int = self.config.force_image_size
patch_size: int = self.config.patch_size
downsample_ratio = self.config.downsample_ratio
tokens_in_single_frame = int(
(image_size * image_size // patch_size**2) * (downsample_ratio**2)
)
for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
pixel_values_lst_video,
video_metadata_lst,
frames_indices_lst,
frame_duration_ms_lst,
):
num_frames = pixel_values.shape[0]
if (
self.video_pruning_rate is not None
and self.video_pruning_rate > 0.0
):
# Start of EVS-specific code
num_tokens = compute_retained_tokens_count(
tokens_per_frame=tokens_in_single_frame,
num_frames=num_frames,
q=self.video_pruning_rate,
)
# Here we just need placeholders that won't actually be replaced -
# we just need to make sure the total number of tokens is correct
# assign all tokens to the first frame
tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
# End of EVS-specific code
else:
tokens_per_frame = [tokens_in_single_frame] * num_frames
video_repl = self.get_video_repl(
tokens_per_frame=tokens_per_frame,
frames_indices=frames_indices,
frame_duration_ms=frame_duration_ms,
tokenizer=self.tokenizer,
img_start_token_ids=self._img_start_token_ids,
img_end_token_ids=self._img_end_token_ids,
img_context_token_ids=self._img_context_token_ids,
)
# video_repl.full is a list of token IDs
# Convert token IDs back to text for the HF processor flow
video_repl_text = self.tokenizer.decode(
video_repl.full, skip_special_tokens=False
)
text = [t.replace("<video>", video_repl_text, 1) for t in text]
return text, video_inputs
def _preprocess_audio(
self,
text: list[str],
audios: list[npt.NDArray],
):
if len(audios) == 0:
return text, {}
assert self.audio_extractor is not None
extractor = self.audio_extractor
parts = [x for x in re.split(f"({re.escape(AUDIO_CONTEXT)})", text[0]) if x]
token_count = parts.count(AUDIO_CONTEXT)
if token_count != len(audios):
raise ValueError(
"Number of audio tokens in text does not match the number "
f"of audios (tokens={token_count}, audios={len(audios)})."
)
audio_index = 0
for idx, part in enumerate(parts):
if part == AUDIO_CONTEXT:
audio_repl = self.get_audio_repl(audios[audio_index])
parts[idx] = audio_repl.full
audio_index += 1
text = ["".join(parts)]
audio_inputs = extractor(
audios,
sampling_rate=extractor.sampling_rate,
return_tensors="pt",
)
input_audio_features = audio_inputs.input_features
feature_attention_mask = audio_inputs.attention_mask
audio_feature_lengths = feature_attention_mask.sum(dim=1)
audio_inputs = {
"input_audio_features": input_audio_features,
"feature_attention_mask": feature_attention_mask,
"audio_feature_lengths": audio_feature_lengths,
}
return text, audio_inputs
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
videos: list[tuple[npt.NDArray, dict[str, Any]]] | None = None,
audios: AudioItem | list[AudioItem] | None = None,
return_tensors: str | TensorType | None = None,
max_num_tiles: int | None = None,
) -> BatchFeature:
# Use default if not provided
if max_num_tiles is None:
max_num_tiles = self.max_num_tiles
text, images, videos, audios = [
self._make_batch_input(x) for x in (text, images, videos, audios)
]
text, image_inputs = self._preprocess_image(
text=text,
images=images,
max_num_tiles=max_num_tiles,
)
text, video_inputs = self._preprocess_video(
text=text,
videos=videos,
max_num_tiles=1,
)
text, audio_inputs = self._preprocess_audio(
text=text,
audios=audios,
)
text_inputs = self.tokenizer(text, add_special_tokens=False)
combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}
if self.dynamic_tiler is None:
batch = BatchFeature(
{**combined_inputs, **image_inputs},
tensor_type=return_tensors,
)
else:
batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
# allow images to be exempt from the BatchFeature validation:
# We will .stack() them in _parse_and_validate_image_input
batch.update(image_inputs)
return batch
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def get_audio_repl(
self,
audio: npt.NDArray,
) -> PromptUpdateDetails[str]:
assert self.audio_extractor is not None
num_tokens = self.audio_extractor.audio_token_count(len(audio))
repl_full = f"{AUDIO_START}{AUDIO_CONTEXT * num_tokens}{AUDIO_END}"
return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)
@classmethod
def get_video_repl(
cls,
*,
tokens_per_frame: list[int],
frames_indices: list[int],
frame_duration_ms: int,
tokenizer: TokenizerLike,
img_start_token_ids: list[int],
img_end_token_ids: list[int],
img_context_token_ids: list[int],
) -> PromptUpdateDetails[list[int]]:
"""
Build prompt replacement for a video.
The replacement returned is not actually used to replace the placeholder
tokens - it's just used to make sure we allocate the correct number
of tokens.
Actual replacement is done in embed_multimodal of
NemotronH_Nano_VL_V2
(specifically in _process_video_input -> _create_final_video_embeddings).
There, we create the final embeddings with text embeddings for indicator tokens
and video embeddings for video tokens.
This is a single function that handles all cases - non EVS, EVS dummy, EVS real.
The differentiation is done via tokens_per_frame parameter.
- non EVS case - constant value same value across all frames
- EVS dummy - Doesn't matter how tokens are distributed between frames - just
make sure the total number of tokens is correct.
- EVS real (called from get_real_video_repl_for_evs) - different value per frame
Args:
tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds
tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
"""
# TODO: Add support of frame_duration_ms to be None
# At preprocessing step we should allow absent / metadata without
# frames_indices field.
timestamps_enabled = frame_duration_ms is not None
if timestamps_enabled:
timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
assert len(timestamps) == len(tokens_per_frame), (
"timestamps and tokens_per_frame must have the same length"
)
frame_separators = [
f"Frame {i + 1} sampled at {timestamp:.2f} seconds: "
for i, timestamp in enumerate(timestamps)
]
else:
frame_separators = [
f"Frame {i + 1}: " for i, _ in enumerate(tokens_per_frame)
]
# Tokenize frame separator independently
frame_separators_tokenized = [
_seq2tokens(tokenizer, sep) for sep in frame_separators
]
# Tokenize each component independently to avoid tokenizer merging tokens
# across boundaries. This ensures consistent tokenization regardless of
# num_tokens_per_frame values.
all_token_ids = []
for i, num_tokens in enumerate(tokens_per_frame):
frame_sep_token_ids = frame_separators_tokenized[i]
all_token_ids.extend(frame_sep_token_ids)
# Add pre-tokenized special tokens
all_token_ids.extend(img_start_token_ids)
all_token_ids.extend(img_context_token_ids * num_tokens)
all_token_ids.extend(img_end_token_ids)
return PromptUpdateDetails.from_seq(all_token_ids)
class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo): class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models.""" """Basic image-only ProcessingInfo for InternVL-style models."""
......
...@@ -11,18 +11,13 @@ import math ...@@ -11,18 +11,13 @@ import math
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Literal from typing import Annotated, Literal
import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from einops import rearrange from einops import rearrange
from PIL import Image
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torchvision import transforms as T
from transformers import ( from transformers import (
BartConfig, BartConfig,
BatchFeature, BatchFeature,
PretrainedConfig, PretrainedConfig,
TensorType,
) )
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
...@@ -59,13 +54,12 @@ from vllm.multimodal.processing import ( ...@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
) )
from vllm.renderers import TokenizeParams from vllm.renderers import TokenizeParams
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.configs.radio import RadioConfig from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.processors.nemotron_parse import NemotronParseProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.v1.attention.backend import AttentionType from vllm.v1.attention.backend import AttentionType
logger = init_logger(__name__) logger = init_logger(__name__)
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class BartScaledWordEmbedding(VocabParallelEmbedding): class BartScaledWordEmbedding(VocabParallelEmbedding):
...@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema): ...@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")] data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
class NemotronParseImageProcessor:
"""
NemotronParse Image Processor
"""
def __init__(
self,
final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
**kwargs,
):
# Ensure final_size is properly formatted
if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
self.final_size = (int(final_size[0]), int(final_size[1]))
elif isinstance(final_size, (int, float)):
self.final_size = (int(final_size), int(final_size))
else:
self.final_size = DEFAULT_FINAL_IMAGE_SIZE # Default fallback
self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
# Create transforms
self._create_transforms()
def _create_transforms(self):
"""Create transform objects."""
try:
import albumentations as A
except ImportError as err:
raise ImportError(
"The package `albumentations` is required to use "
"NemotronParse model. Please install it with `pip install "
"albumentations`."
) from err
# Ensure final_size is a tuple of integers
if isinstance(self.final_size, (list, tuple)):
self.target_height, self.target_width = (
int(self.final_size[0]),
int(self.final_size[1]),
)
else:
self.target_height = self.target_width = int(self.final_size)
import cv2
self.transform = A.Compose(
[
A.PadIfNeeded(
min_height=self.target_height,
min_width=self.target_width,
border_mode=cv2.BORDER_CONSTANT,
fill=[255, 255, 255],
p=1.0,
),
]
)
self.torch_transform = T.Compose(
[
T.ToTensor(),
]
)
def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
"""Resize image maintaining aspect ratio (exact replica of original
LongestMaxSizeHW)."""
height, width = image.shape[:2]
max_size_height = self.target_height
max_size_width = self.target_width
# Original LongestMaxSizeHW algorithm from custom_augmentations.py
aspect_ratio = width / height
new_height = height
new_width = width
# If height too big then scale image down
if height > max_size_height:
new_height = max_size_height
new_width = int(new_height * aspect_ratio)
# If width too big, scale image down further
if new_width > max_size_width:
new_width = max_size_width
new_height = int(new_width / aspect_ratio)
# Use cv2.INTER_LINEAR like the original
import cv2
return cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
)
def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
"""Pad image to target size with white padding (matches A.PadIfNeeded
behavior)."""
h, w = image.shape[:2]
min_height, min_width = self.target_height, self.target_width
# Only pad if image is smaller than target (matches A.PadIfNeeded logic)
pad_h = max(0, min_height - h)
pad_w = max(0, min_width - w)
if pad_h == 0 and pad_w == 0:
return image
# A.PadIfNeeded pads to bottom-right with constant value
if len(image.shape) == 3:
# Color image - pad bottom and right with white (255, 255, 255)
padded = np.pad(
image,
((0, pad_h), (0, pad_w), (0, 0)),
mode="constant",
constant_values=255,
)
else:
# Grayscale image - pad with white (255)
padded = np.pad(
image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
)
return padded
def preprocess(
self,
images: Image.Image | list[Image.Image],
**kwargs,
) -> dict[str, torch.Tensor]:
"""
Preprocess an image or batch of images for the NemotronParse model.
Args:
images: Input image(s)
"""
# Ensure images is a list
if not isinstance(images, list):
images = [images]
# Convert PIL images to numpy arrays if needed
processed_images = []
for image in images:
if isinstance(image, Image.Image):
image = np.asarray(image)
processed_images.append(image)
# Apply NemotronParse-specific transforms
pixel_values = []
for image in processed_images:
# Manual resize with aspect ratio preservation
# (replaces LongestMaxSizeHW)
processed_image = self._resize_with_aspect_ratio(image)
# Apply remaining albumentations transforms if available
if self.transform is not None:
transformed = self.transform(image=processed_image)
processed_image = transformed["image"]
else:
# Fallback: just pad to target size
processed_image = self._pad_to_size(processed_image)
# Convert to tensor
pixel_values_tensor = self.torch_transform(processed_image)
# Handle grayscale images
if pixel_values_tensor.shape[0] == 1:
pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
pixel_values.append(pixel_values_tensor)
# Stack into batch
pixel_values = torch.stack(pixel_values)
# Normalize pixel values
normalized_values = (pixel_values - self.norm_mean) / self.norm_std
return {"pixel_values": normalized_values}
def __call__(
self, images: Image.Image | list[Image.Image], **kwargs
) -> dict[str, torch.Tensor]:
return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
"""
NemotronParse Processor
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
**kwargs,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)
def _make_batch_input(self, input_item=None):
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
def __call__(
self,
text: str | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text, images = [self._make_batch_input(x) for x in (text, images)]
image_inputs = {} if len(images) == 0 else self.image_processor(images)
text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
combined_outputs = BatchFeature(
data={**text_inputs, **image_inputs},
tensor_type=return_tensors,
)
return combined_outputs
class NemotronParseProcessingInfo(BaseProcessingInfo): class NemotronParseProcessingInfo(BaseProcessingInfo):
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config() return self.ctx.get_hf_config()
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import math import math
from abc import ABC
from collections.abc import Iterable from collections.abc import Iterable
import torch import torch
import torch.nn as nn import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from transformers import AutoModel, PretrainedConfig from transformers import AutoModel, PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
...@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import ( ...@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
InternVLImageEmbeddingInputs, InternVLImageEmbeddingInputs,
InternVLImageInputs, InternVLImageInputs,
InternVLImagePixelInputs, InternVLImagePixelInputs,
InternVLProcessor,
) )
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_image_processor_from_config from vllm.transformers_utils.processor import cached_image_processor_from_config
from vllm.transformers_utils.processors.nemotron_vl import (
LlamaNemotronVLEmbedProcessor,
NemotronVLProcessor,
)
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
from .interfaces import ( from .interfaces import (
...@@ -58,310 +47,6 @@ from .utils import ( ...@@ -58,310 +47,6 @@ from .utils import (
) )
def build_transform(input_size: int):
return T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
),
T.ToTensor(),
]
)
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_factor = float("-inf")
best_ratio = (1, 1)
area = width * height
for rw, rh in target_ratios:
target_aspect_ratio = rw / rh
size_factor = min((rw * rh * image_size * image_size) / area, 0.6)
ratio_closeness = min(
target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio
)
factor = size_factor * ratio_closeness
if factor > best_factor:
best_factor = factor
best_ratio = (rw, rh)
return best_ratio
def calculate_nemotron_vl_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height
def dynamic_preprocess_nemotron_vl(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> list[Image.Image]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
blocks, target_width, target_height = calculate_nemotron_vl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def get_nemotron_vl_target_ratios(
min_num: int,
max_num: int,
) -> list[tuple[int, int]]:
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if min_num <= i * j <= max_num
}
return sorted(target_ratios, key=lambda x: x[0] * x[1])
def image_to_pixel_values_nemotron_vl(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
transform: T.Compose | None = None,
) -> torch.Tensor:
target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
if transform is None:
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_nemotron_vl(
image,
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values
class NemotronVLProcessor(InternVLProcessor):
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
image_processor: BaseImageProcessorFast | None = None,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
ABC.__init__(self)
self.config = config
self.tokenizer = tokenizer
self.image_processor = image_processor
image_size: int = config.force_image_size
patch_size: int = config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = 1
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = self.image_processor.max_num_tiles
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = True
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
if image_processor is not None:
self.use_thumbnail = image_processor.use_thumbnail
else:
self.use_thumbnail = getattr(config, "use_thumbnail", True)
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
def _get_transform(self) -> T.Compose:
return build_transform(input_size=self.image_size)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_nemotron_vl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=self._get_transform(),
)
for image in images
]
def _replace_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Replace <image> placeholders with image tokens."""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
def _preprocess_image(
self,
text: list[str],
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
text = self._replace_image_tokens(text, pixel_values_lst)
return text, image_inputs
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = self.IMG_CONTEXT * feature_size
repl_full = self.IMG_START + repl_features + self.IMG_END
return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Nemotron VL models.""" """Processing info for Nemotron VL models."""
...@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor ...@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# - Pooler output instead of generative logits # - Pooler output instead of generative logits
# -------------------------------------------------------- # --------------------------------------------------------
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
"""Build transform for SigLIP vision encoder with normalization.
Extends the base transform from nemotron_vl with SigLIP-specific normalization.
"""
base_transform = build_transform(input_size=input_size)
return T.Compose(
[
base_transform,
T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
]
)
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
"""
Processor for LlamaNemotronVL embedding model.
Inherits from NemotronVLProcessor and specializes it for embedding tasks:
- Uses SigLIP transform with normalization instead of base transform
- Uses different image context token (<IMG_CONTEXT> vs <image>)
"""
IMG_CONTEXT = "<IMG_CONTEXT>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
processor_config: dict,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
if min_dynamic_patch is None:
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
if max_dynamic_patch is None:
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
if dynamic_image_size is None:
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
super().__init__(
config=config,
tokenizer=tokenizer,
image_processor=None,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
def _get_transform(self) -> T.Compose:
"""Override to add SigLIP normalization."""
return build_siglip_transform(input_size=self.image_size)
def _replace_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Override with simpler token replacement for embedding model.
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
not <image>, so there's no collision risk.
"""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo): class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
"""Processing info for LlamaNemotronVL embedding model.""" """Processing info for LlamaNemotronVL embedding model."""
......
...@@ -27,48 +27,16 @@ from vllm.multimodal.processing import ( ...@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
PromptUpdateDetails, PromptUpdateDetails,
) )
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from .intern_vit import InternVisionModel from .intern_vit import InternVisionModel
from .internvl import ( from .internvl import (
BaseInternVLDummyInputsBuilder, BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor, BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo, BaseInternVLProcessingInfo,
BaseInternVLProcessor,
InternVLChatModel, InternVLChatModel,
) )
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_PAD]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
if self.use_thumbnail:
tile_pos_identifiers += ["<tile_global_thumbnail>"]
context_size = feature_size // num_patches
features = "".join(
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
)
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
repl = "<Image>" + features + "</Image>"
return PromptUpdateDetails.select_text(repl, IMG_PAD)
class NVLMProcessingInfo(BaseInternVLProcessingInfo): class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
......
...@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias ...@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias
import torch import torch
import torch.nn as nn import torch.nn as nn
import torchvision.transforms as T from transformers import BatchFeature, PretrainedConfig
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
...@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import ( ...@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
InternVisionPatchModel, InternVisionPatchModel,
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict, MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
...@@ -44,22 +41,14 @@ from vllm.multimodal.processing import ( ...@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
BaseProcessingInfo, BaseProcessingInfo,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
PromptUpdateDetails,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class SkyworkR1VImagePixelInputs(TensorSchema): class SkyworkR1VImagePixelInputs(TensorSchema):
""" """
...@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = ( ...@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
) )
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
return T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def resolve_skyworkr1v_min_max_num(
*,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
max_dynamic_patch += 1
return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
min_num: int,
max_num: int,
) -> list[tuple[int, int]]:
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if min_num <= i * j <= max_num
}
return sorted(target_ratios, key=lambda x: x[0] * x[1])
def calculate_skyworkr1v_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height
def dynamic_preprocess_skyworkr1v(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> list[Image.Image]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
blocks, target_width, target_height = calculate_skyworkr1v_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
) -> torch.Tensor:
target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_skyworkr1v(
image,
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values
class SkyworkR1VProcessor:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_skyworkr1v_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
return get_skyworkr1v_target_ratios(min_num, max_num)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_skyworkr1v_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_skyworkr1v(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
)
for image in images
]
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
class SkyworkR1VProcessingInfo(BaseProcessingInfo): class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
return self.ctx.init_processor( return self.ctx.init_processor(
......
...@@ -13,35 +13,53 @@ import importlib ...@@ -13,35 +13,53 @@ import importlib
__all__ = [ __all__ = [
"BagelProcessor", "BagelProcessor",
"DeepseekVLV2Processor", "DeepseekVLV2Processor",
"Eagle2_5_VLProcessor",
"FireRedASR2Processor", "FireRedASR2Processor",
"FunASRProcessor", "FunASRProcessor",
"GLM4VProcessor", "GLM4VProcessor",
"H2OVLProcessor",
"HunYuanVLProcessor", "HunYuanVLProcessor",
"HunYuanVLImageProcessor", "HunYuanVLImageProcessor",
"InternVLProcessor",
"KimiAudioProcessor", "KimiAudioProcessor",
"MistralCommonPixtralProcessor", "MistralCommonPixtralProcessor",
"MistralCommonVoxtralProcessor", "MistralCommonVoxtralProcessor",
"NanoNemotronVLProcessor",
"NemotronParseProcessor",
"NemotronVLProcessor",
"LlamaNemotronVLEmbedProcessor",
"NVLMProcessor",
"OvisProcessor", "OvisProcessor",
"Ovis2_5Processor", "Ovis2_5Processor",
"QwenVLProcessor", "QwenVLProcessor",
"Qwen3ASRProcessor", "Qwen3ASRProcessor",
"SkyworkR1VProcessor",
] ]
_CLASS_TO_MODULE: dict[str, str] = { _CLASS_TO_MODULE: dict[str, str] = {
"BagelProcessor": "vllm.transformers_utils.processors.bagel", "BagelProcessor": "vllm.transformers_utils.processors.bagel",
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2", "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
"Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2", "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
"FunASRProcessor": "vllm.transformers_utils.processors.funasr", "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v", "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
"H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
"HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl", "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
"HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image", "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
"InternVLProcessor": "vllm.transformers_utils.processors.internvl",
"KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio", "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
"MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral", "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
"MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral", "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
"NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
"NemotronParseProcessor": "vllm.transformers_utils.processors.nemotron_parse",
"NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
"LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
"NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
"OvisProcessor": "vllm.transformers_utils.processors.ovis", "OvisProcessor": "vllm.transformers_utils.processors.ovis",
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5", "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl", "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr", "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
"SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
} }
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
"""
Custom processor for Eagle2.5-VL model.
Extends BaseInternVLProcessor with Eagle-specific token handling.
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
# Skip super().__init__() to avoid config manipulation
# Directly initialize all required attributes
self.config = config
self.tokenizer = tokenizer
# Image size with force_image_size override
image_size: int = config.vision_config.image_size
if hasattr(config, "force_image_size") and config.force_image_size:
image_size = config.force_image_size
patch_size: int = config.vision_config.patch_size
downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
# Compute num_image_token
self.num_image_token = int(
(image_size // patch_size) ** 2 * (downsample_ratio**2)
)
self.image_size = image_size
# Dynamic patch settings with defaults
self.min_dynamic_patch = (
min_dynamic_patch
if min_dynamic_patch is not None
else getattr(config, "min_dynamic_patch", 1)
)
self.max_dynamic_patch = (
max_dynamic_patch
if max_dynamic_patch is not None
else getattr(config, "max_dynamic_patch", 12)
)
self.dynamic_image_size = (
dynamic_image_size
if dynamic_image_size is not None
else getattr(config, "dynamic_image_size", True)
)
self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
@property
def image_token_id(self) -> int:
"""Get the image token ID from config or tokenizer."""
if hasattr(self.config, "image_token_index"):
return self.config.image_token_index
# Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
vocab = self.tokenizer.get_vocab()
if IMG_CONTEXT in vocab:
return vocab[IMG_CONTEXT]
raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
"""Get image replacement string for prompt."""
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
import torch
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLProcessor,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
)
def resolve_h2ovl_min_max_num(
*,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
max_dynamic_patch += 1
return min_dynamic_patch, max_dynamic_patch
def get_h2ovl_target_ratios(
min_num: int,
max_num: int,
*,
prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
target_ratios = get_internvl_target_ratios(min_num, max_num)
# if prior_aspect_ratio is provided, filter the target ratios
if prior_aspect_ratio is not None:
target_ratios = [
ratio
for ratio in target_ratios
if prior_aspect_ratio[0] % ratio[0] != 0
and prior_aspect_ratio[1] % ratio[1] != 0
]
return target_ratios
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height, target_aspect_ratio
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
(
blocks,
target_width,
target_height,
target_aspect_ratio,
) = calculate_h2ovl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
def _preprocess_image(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
target_ratios = get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=prior_aspect_ratio,
)
transform = build_transform(input_size=input_size)
images, target_aspect_ratio = dynamic_preprocess_h2ovl(
image,
image_size=input_size,
use_thumbnail=use_thumbnail,
target_ratios=target_ratios,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values, target_aspect_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
use_msac: bool,
) -> torch.Tensor:
# when MSAC is turned on, we need to process the image twice
if use_msac:
# first pass
pixel_values1, aspect_ratio1 = _preprocess_image(
image,
input_size=input_size,
min_num=1,
max_num=max_num,
use_thumbnail=True,
prior_aspect_ratio=None,
)
# second pass
pixel_values2, _ = _preprocess_image(
image,
input_size=input_size,
min_num=3,
max_num=max_num,
use_thumbnail=True,
prior_aspect_ratio=aspect_ratio1,
)
# combine pixel values
pixel_values = torch.cat(
[pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0
)
else:
pixel_values, _ = _preprocess_image(
image,
input_size=input_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=use_thumbnail,
prior_aspect_ratio=None,
)
return pixel_values
class H2OVLProcessor(BaseInternVLProcessor):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_msac: bool | None = None,
) -> None:
super().__init__(
config,
tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
if use_msac is None:
use_msac = config.use_msac
assert isinstance(use_msac, bool)
self.use_msac = use_msac
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_h2ovl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
prior_aspect_ratio: tuple[int, int] | None = None,
override_min_num: int | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
if override_min_num is not None:
min_num = override_min_num
return get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=prior_aspect_ratio,
)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
use_msac: bool | None = None,
) -> int:
use_msac = self.use_msac if use_msac is None else use_msac
use_thumbnail = self.use_thumbnail
if use_msac:
target_ratios_1 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
override_min_num=1,
)
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios_1,
use_thumbnail=True,
)
target_ratios_2 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
prior_aspect_ratio=aspect_ratio_1,
override_min_num=3,
)
num_patches_2, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios_2,
use_thumbnail=True,
)
num_patches = num_patches_1 + num_patches_2 - 1
else:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
use_msac = self.use_msac if len(images) == 1 else False
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_h2ovl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
use_msac=use_msac,
)
for image in images
]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, TypeVar
import numpy.typing as npt
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
_T = TypeVar("_T")
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def resolve_internvl_min_max_num(
*,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
max_dynamic_patch += 1
return min_dynamic_patch, max_dynamic_patch
def get_internvl_target_ratios(
min_num: int,
max_num: int,
) -> list[tuple[int, int]]:
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if min_num <= i * j <= max_num
}
return sorted(target_ratios, key=lambda x: x[0] * x[1])
def calculate_internvl_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> list[Image.Image]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
blocks, target_width, target_height = calculate_internvl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
) -> torch.Tensor:
target_ratios = get_internvl_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_internvl(
image,
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
video: npt.NDArray,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
) -> torch.Tensor:
target_ratios = get_internvl_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
frames_list = list[Image.Image]()
for frame in video:
pil_frame = dynamic_preprocess_internvl(
Image.fromarray(frame, mode="RGB"),
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
assert len(pil_frame) == 1
frames_list.extend(pil_frame)
pixel_values = torch.stack([transform(image) for image in frames_list])
return pixel_values
class BaseInternVLProcessor(ABC):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@property
@abstractmethod
def image_token_id(self) -> int:
raise NotImplementedError
@abstractmethod
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
raise NotImplementedError
def resolve_min_max_num(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_internvl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
return get_internvl_target_ratios(min_num, max_num)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_internvl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
)
for image in images
]
def _preprocess_image(
self,
text: list[str],
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text, image_inputs
def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
text, image_inputs = self._preprocess_image(
text=text,
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
"""
HF Processor for InternVLChatModel with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
video_token: str | None = None,
) -> None:
super().__init__(
config=config,
tokenizer=tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
# add extra video token for video processing
self.video_token = video_token
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
@property
def video_token_id(self) -> int | None:
if self.video_token is None:
return None
return self.tokenizer.get_vocab().get(self.video_token, None)
@property
def supports_video(self) -> bool:
return self.video_token_id is not None
def _videos_to_pixel_values_lst(
self,
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=1,
max_dynamic_patch=1,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
video_to_pixel_values_internvl(
video,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=False,
)
for video in videos
]
def _preprocess_video(
self,
text: list[str],
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, Any]]:
if len(videos) == 0 or not self.supports_video:
return text, {}
video_token = self.video_token
assert video_token is not None
pixel_values_lst_video = self._videos_to_pixel_values_lst(
videos,
dynamic_image_size=dynamic_image_size,
)
video_inputs = {
"pixel_values_flat_video": torch.cat(pixel_values_lst_video),
"video_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst_video]
),
}
for pixel_values in pixel_values_lst_video:
num_patches = pixel_values.shape[0]
video_repl = self.get_video_repl(
self.num_image_token, num_patches, video_token
)
text = [t.replace("<video>", video_repl.full, 1) for t in text]
return text, video_inputs
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
videos: npt.NDArray | list[npt.NDArray] | None = None,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
videos = self._make_batch_input(videos)
text, image_inputs = self._preprocess_image(
text=text,
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
text, video_inputs = self._preprocess_video(
text=text,
videos=videos,
dynamic_image_size=dynamic_image_size,
)
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def get_video_repl(
self,
feature_size: int,
num_patches: int | None,
video_context_token: str = IMG_CONTEXT,
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
repl_features = video_context_token * self.num_image_token
repl_features_with_sep = IMG_START + repl_features + IMG_END
# num_patches is equal to num_frames
repl_full = "".join(
[f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
)
return PromptUpdateDetails.select_text(repl_full, video_context_token)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# --------------------------------------------------------
# Adapted from
# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/internvl.py
# under Apache-2.0 License
# LICENSE is in root directory.
# --------------------------------------------------------
import math
from abc import ABC, abstractmethod
from collections.abc import Sequence
from dataclasses import dataclass
from typing import Any, TypeVar
import einops
import numpy as np
import numpy.typing as npt
import regex as re
import torch
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.model_executor.models.parakeet import ParakeetExtractor
from vllm.multimodal.evs import compute_retained_tokens_count
from vllm.multimodal.inputs import AudioItem
from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
from vllm.tokenizers import TokenizerLike
from .internvl import calculate_internvl_targets, get_internvl_target_ratios
_T = TypeVar("_T")
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
AUDIO_START = "<so_start>"
AUDIO_END = "<so_end>"
AUDIO_CONTEXT = "<so_embedding>"
# Profiling
# MAX_FRAMES = 16
DEFAULT_NUM_TILES = 12
def calculate_timestamps(
indices: list[int] | torch.Tensor,
frame_duration_ms: int,
):
if not isinstance(indices, list):
indices = indices.tolist()
timestamps = [int(i) * frame_duration_ms / 1000.0 for i in indices]
return timestamps
def input_conditioner(x: torch.Tensor, norm_mean: torch.Tensor, norm_std: torch.Tensor):
return (x - norm_mean) / norm_std
def dynamic_preprocess(
image,
*,
image_size=512,
max_num_tiles=12,
use_thumbnail=True,
idx=0,
):
orig_width, orig_height = image.size
target_ratios = get_internvl_target_ratios(1, max_num_tiles)
blocks, target_width, target_height = calculate_internvl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
image = np.asarray(
image.convert("RGB") if image.mode != "RGB" else image, dtype=np.uint8
)
image = torch.from_numpy(image).unsqueeze(0) # (1, H, W, 3)
image = image.permute(0, 3, 1, 2) # (1, 3, H, W)
resized_img = torch.nn.functional.interpolate(
image,
size=(target_height, target_width),
mode="bicubic",
align_corners=False,
antialias=True,
)
B, C, H, W = resized_img.shape
hp, wp = H // image_size, W // image_size
patches = (
resized_img.reshape(B, C, hp, image_size, wp, image_size)
.permute(0, 2, 4, 1, 3, 5)
.reshape(B * hp * wp, C, image_size, image_size)
/ 255.0
)
if use_thumbnail and patches.shape[0] > 1:
thumb = (
torch.nn.functional.interpolate(
image,
size=(image_size, image_size),
mode="bicubic",
align_corners=False,
antialias=True,
)
/ 255.0
)
patches = torch.cat([patches, thumb], dim=0)
return list(patches)
def image_to_pixel_values(
image: Image.Image,
*,
input_size: int,
max_num: int,
use_thumbnail: bool,
idx: int,
) -> torch.Tensor:
images = dynamic_preprocess(
image,
image_size=input_size,
max_num_tiles=max_num,
use_thumbnail=use_thumbnail,
idx=idx,
)
pixel_values = torch.stack(images)
return pixel_values
def video_to_pixel_values(
video: npt.NDArray,
*,
input_size: int,
max_num_tiles: int = 1,
use_thumbnail: bool,
) -> torch.Tensor:
assert max_num_tiles == 1, "Video modality always uses one tile"
# (num_frames, H, W, C) -> (num_frames, C, H, W)
video_tensor = torch.from_numpy(video).permute(0, 3, 1, 2)
if video_tensor.shape[2] != input_size or video_tensor.shape[3] != input_size:
video_tensor = torch.nn.functional.interpolate(
video_tensor,
size=(input_size, input_size),
mode="bicubic",
align_corners=False,
antialias=True,
)
video_tensor = video_tensor / 255.0
return video_tensor
class DynamicResolutionImageTiler:
CONV_MERGING = False
PIXEL_SHUFFLE = True
USE_THUMBNAIL = False
def __init__(
self,
*,
max_model_len: int,
patch_size: int,
min_num_patches: int,
max_num_patches: int,
downsample_ratio: int,
norm_mean: Sequence[float],
norm_std: Sequence[float],
factor_max: float = 1.0,
use_thumbnail: bool = False,
) -> None:
assert use_thumbnail is False, "use_thumbnail is not supported"
self._patch_size: int = patch_size
self._max_model_len = max_model_len
self._min_num_patches = min_num_patches
self._max_num_patches = max_num_patches if max_num_patches > 0 else float("inf")
self._factor_max = factor_max
self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1)
self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1)
assert downsample_ratio < 1
reduction_factor = 1 / downsample_ratio
assert reduction_factor == 2.0
self._downsample_ratio = int(reduction_factor) ** (
self.PIXEL_SHUFFLE + self.CONV_MERGING
)
assert self._downsample_ratio == 2
def _get_num_embeddings(self, width: int, height: int) -> int:
num_patches = (width // self._patch_size) * (height // self._patch_size)
num_tokens = num_patches // (self._downsample_ratio**2)
return num_tokens
def width_and_height_for_max_num_tokens_available(
self,
target_num_tokens_post_shuffle: int,
) -> tuple[int, int]:
"""
TODO: optimize this so it squeezes closer to target number of tokens.
Calculate image dimensions that produce approximately `target` tokens after
pixel_shuffle.
With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
need 4*B patches to get B tokens.
Examples:
>>> PATCH_SIZE = 16
>>> DOWNSAMPLE_RATIO = 0.5
>>> tiler = DynamicResolutionImageTiler(
... max_model_len=16384,
... patch_size=PATCH_SIZE,
... downsample_ratio=DOWNSAMPLE_RATIO,
... min_num_patches=4,
... max_num_patches=0,
... )
>>> width, height = tiler.width_and_height_for_max_num_tokens_available(
... target_num_tokens_post_shuffle=8192,
... )
>>> assert width, height == (2880, 2880)
>>> assert (width // PATCH_SIZE) * (
... height // PATCH_SIZE
... ) // 2**2 == 8100 # tokens post-shuffle
>>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
"""
side_pixels = (
math.isqrt(target_num_tokens_post_shuffle)
* self._downsample_ratio
* self._patch_size
)
assert isinstance(side_pixels, int) and side_pixels % self._patch_size == 0
return side_pixels, side_pixels
def max_num_tokens_available(self, text_prompt_length: int) -> int:
return self._max_model_len - text_prompt_length - 4
def _images_to_pixel_values_lst(
self,
text_prompt_length: int,
images: list[Image.Image],
) -> tuple[list[torch.Tensor], list[int]]:
num_tokens_available = self.max_num_tokens_available(text_prompt_length)
params_per_image = self.compute_params(images, num_tokens_available)
feature_sizes = []
images = []
for param in params_per_image:
for t in self.apply_params(param):
assert t.ndim == 3, f"{t.ndim=}: expected 3 dim tensor"
images.append(t)
feature_sizes.append(param.num_embeddings)
return images, feature_sizes
feature_size_cache: dict[Image.Image, int] = {}
@classmethod
def get_cached_feature_size(cls, image: Image.Image) -> int:
feature_size = cls.feature_size_cache[id(image)]
# hard assert that we only use the feature size once
del cls.feature_size_cache[id(image)]
return feature_size
@dataclass
class DynamicResolutionParams:
media: Image.Image
num_tiles: int
num_embeddings: int
patch_size: tuple[int, int]
def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
target_size = (
params.patch_size[1] * self._patch_size,
params.patch_size[0] * self._patch_size,
)
image = np.asarray(
params.media.convert("RGB") if params.media.mode != "RGB" else params.media,
dtype=np.uint8,
)
resized_img = (
torch.nn.functional.interpolate(
torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
size=target_size,
mode="bicubic",
align_corners=False,
antialias=True,
)
/ 255.0
)
return list(resized_img)
def process_media(
self,
media: Image.Image,
num_tokens_available: int,
) -> tuple[DynamicResolutionParams, int]:
"""Process a single media item and return its parameters.
Args:
media: The media item to process
num_tokens_available: Number of tokens available for this media
Returns:
DynamicResolutionParams for the media
"""
current_num_tokens_available = num_tokens_available
assert isinstance(media, Image.Image), (
"Dynamic resolution is only supported for image media"
)
orig_width, orig_height = media.width, media.height
closest_patch_height = round(orig_height / self._patch_size + 0.5)
closest_patch_width = round(orig_width / self._patch_size + 0.5)
patches = closest_patch_height * closest_patch_width
factor = min(
math.sqrt(current_num_tokens_available / patches), self._factor_max
)
target_patch_height = math.floor(factor * closest_patch_height)
target_patch_width = math.floor(factor * closest_patch_width)
# Consider self._min_num_patches if > current_num_tokens_available.
if (
current_num_tokens_available > self._min_num_patches
and target_patch_height * target_patch_width < self._min_num_patches
):
up_factor = math.sqrt(
self._min_num_patches / (target_patch_height * target_patch_width)
)
target_patch_height = math.ceil(up_factor * target_patch_height)
target_patch_width = math.ceil(up_factor * target_patch_width)
# Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
# or by 4 when BOTH are enabled (two successive 2x reductions)
if self.PIXEL_SHUFFLE or self.CONV_MERGING:
required_divisor = 4 if (self.PIXEL_SHUFFLE and self.CONV_MERGING) else 2
rem_h = target_patch_height % required_divisor
if rem_h != 0:
inc_h = required_divisor - rem_h
if (
target_patch_height + inc_h
) * target_patch_width <= current_num_tokens_available:
target_patch_height += inc_h
else:
target_patch_height = max(
required_divisor, target_patch_height - rem_h
)
rem_w = target_patch_width % required_divisor
if rem_w != 0:
inc_w = required_divisor - rem_w
if (
target_patch_height * (target_patch_width + inc_w)
<= current_num_tokens_available
):
target_patch_width += inc_w
else:
target_patch_width = max(
required_divisor, target_patch_width - rem_w
)
# Calculate embeddings for the main dynamic resolution image
num_embeddings = self._get_num_embeddings(
target_patch_width * self._patch_size,
target_patch_height * self._patch_size,
)
token_count = target_patch_width * target_patch_height
# Add thumbnail embeddings if enabled and image area is below threshold
num_tiles = 1 # Base dynamic resolution image
return self.DynamicResolutionParams(
media=media,
num_tiles=num_tiles,
num_embeddings=num_embeddings,
patch_size=(target_patch_width, target_patch_height),
), token_count
def compute_params(
self,
media_list: list[Image.Image],
num_tokens_available: int,
) -> list[DynamicResolutionParams]:
"""Compute parameters for all media with iterative token budgeting.
Args:
media_list: List of media items to process
num_tokens_available: Total number of tokens available across all media
Returns:
List of ImageTilingParams for each media item
"""
num_tokens_available = (
num_tokens_available
* (4 if self.PIXEL_SHUFFLE else 1)
* (4 if self.CONV_MERGING else 1)
)
# When the number of available token is too small,
# allow self._min_num_patches per media and let the sample be truncated.
num_tokens_available = max(
num_tokens_available, self._min_num_patches * len(media_list)
)
# Clip the number of tokens available per media to >min and <max patches.
num_tokens_available_per_media = [
int(
max(
min(num_tokens_available, self._max_num_patches),
self._min_num_patches,
)
)
for _ in range(len(media_list))
]
# prevent infinite loop in any case
for _ in range(10):
# Step 1: Process each media with current token budget
params = []
token_counts = []
for media, tokens_for_media in zip(
media_list, num_tokens_available_per_media
):
param, token_count = self.process_media(media, tokens_for_media)
params.append(param)
token_counts.append(token_count)
self.feature_size_cache[id(param.media)] = param.num_embeddings
# Step 2: Check if total tokens is within budget
total_tokens = sum(token_counts)
if total_tokens <= num_tokens_available:
# We're within budget, return the params
return params
# Step 3: We're over budget, need to scale down
# Calculate scaling factor to get under budget
scaling_factor = num_tokens_available / total_tokens
# Recalculate token budgets for each media based on scaling
# Each media gets a proportional share of the total budget
scaled_down_num_tokens_available_per_media = [
max(self._min_num_patches, int(token_count * scaling_factor))
for token_count in token_counts
]
scaled_down = any(
[
scaled_down_num_tokens_available_per_media[i]
< num_tokens_available_per_media[i]
for i in range(len(num_tokens_available_per_media))
]
)
# If there wasn't scaling down, we're stuck with min_num_patches per media,
# else try with the scaled down num_tokens_available_per_media.
if not scaled_down:
num_tokens_available_per_media = [self._min_num_patches] * len(
media_list
)
else:
num_tokens_available_per_media = (
scaled_down_num_tokens_available_per_media
)
ctx = f"{params=} {total_tokens=} {num_tokens_available=}"
raise ValueError(
f"Should be unreachable - `return params` above must be reached: {ctx}"
)
@staticmethod
def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
assert len(images) > 0, "No images to stack"
def rearrange_img(x):
py = x.shape[-2] // patch_size
px = x.shape[-1] // patch_size
x = einops.rearrange(
x,
"c (py yy) (px xx) -> (py px) (c yy xx)",
py=py,
yy=patch_size,
px=px,
xx=patch_size,
)
return x
imgs = [rearrange_img(img) for img in images]
pixel_values_flat = torch.cat(imgs, dim=0).unsqueeze(0)
return pixel_values_flat
class BaseNanoNemotronVLProcessor(ABC):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*args,
max_model_len: int,
max_num_tiles: int | None = None,
**kwargs,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
image_size: int = config.force_image_size
patch_size: int = config.patch_size
downsample_ratio: int = config.downsample_ratio
self.num_image_token = int(
(image_size // patch_size) ** 2 * (downsample_ratio**2)
)
self.image_size = image_size
self.use_thumbnail: bool = config.use_thumbnail
self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)
self.dynamic_tiler: DynamicResolutionImageTiler | None = None
if self.use_dynamic_resolution(config):
self.dynamic_tiler = DynamicResolutionImageTiler(
max_model_len=max_model_len,
patch_size=patch_size,
downsample_ratio=downsample_ratio,
min_num_patches=config.vision_config.args["min_num_patches"],
max_num_patches=config.vision_config.args["max_num_patches"],
norm_mean=config.norm_mean,
norm_std=config.norm_std,
)
@staticmethod
def use_dynamic_resolution(config: PretrainedConfig) -> bool:
return "min_num_patches" in config.vision_config.args
@property
@abstractmethod
def image_token_id(self) -> int:
raise NotImplementedError
@abstractmethod
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
raise NotImplementedError
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
max_num_tiles: int,
) -> int:
target_ratios = get_internvl_target_ratios(1, max_num_tiles)
num_patches, _, _ = calculate_internvl_targets(
orig_width=image_width,
orig_height=image_height,
target_ratios=target_ratios,
image_size=self.image_size,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
max_num_tiles: int,
) -> list[torch.Tensor]:
return [
image_to_pixel_values(
image,
input_size=self.image_size,
max_num=max_num_tiles,
use_thumbnail=self.use_thumbnail,
idx=idx,
)
for idx, image in enumerate(images)
]
def _preprocess_image(
self,
text: list[str],
images: list[Image.Image],
max_num_tiles: int,
) -> tuple[list[str], dict[str, Any]]:
if len(images) == 0:
return text, {}
image_inputs: dict[str, Any]
if tiler := self.dynamic_tiler:
sans_images = text[0].replace("<image>", "")
text_prompt_length = len(
self.tokenizer(sans_images, add_special_tokens=False).input_ids
)
pixel_values_lst, num_tokens_per_image = tiler._images_to_pixel_values_lst(
text_prompt_length=text_prompt_length,
images=images,
)
imgs_sizes = [(pv.shape[-2], pv.shape[-1]) for pv in pixel_values_lst]
normalized = [
input_conditioner(img, tiler.norm_mean, tiler.norm_std)
for img in pixel_values_lst
]
image_num_patches = torch.tensor([1] * len(num_tokens_per_image))
image_inputs = {
"pixel_values_flat": normalized,
"imgs_sizes": imgs_sizes,
"num_tokens_per_image": num_tokens_per_image,
}
else:
pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
image_num_patches = torch.tensor([len(item) for item in pixel_values_lst])
pixel_values_flat = input_conditioner(
torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
)
image_inputs = {
"pixel_values_flat": pixel_values_flat,
"image_num_patches": image_num_patches,
}
num_tokens_per_image = [
self.num_image_token * len(item) for item in pixel_values_lst
]
assert len(text) == 1, (
"hf_processor is called on the output of get_dummy_text, "
"which should be a single string"
)
parts = [x for x in re.split(r"(<image>)", text[0]) if x]
assert parts.count("<image>") == len(pixel_values_lst), (
"the number of <image> tokens in the text should be the "
"same as the number of images"
)
for i, (feature_size, num_patches) in enumerate(
zip(num_tokens_per_image, image_num_patches, strict=True)
):
image_repl = self.get_image_repl(feature_size, num_patches)
parts[i] = parts[i].replace("<image>", image_repl.full)
text = ["".join(parts)]
return text, image_inputs
def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
@abstractmethod
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
*,
return_tensors: str | TensorType | None = None,
max_num_tiles: int | None = None,
**kwargs,
) -> BatchFeature:
raise NotImplementedError
class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
"""
HF Processor with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
max_model_len: int,
max_num_tiles: int | None = None,
video_token: str | None = None,
video_pruning_rate: float | None = None,
) -> None:
super().__init__(
config=config,
tokenizer=tokenizer,
max_model_len=max_model_len,
max_num_tiles=max_num_tiles,
)
# add extra video token for video processing
self.video_token = video_token
self.video_pruning_rate = video_pruning_rate
self.audio_extractor: ParakeetExtractor | None = None
raw_sound_config = getattr(config, "sound_config", None)
if raw_sound_config is not None:
self.audio_extractor = ParakeetExtractor(raw_sound_config)
# Pre-tokenize special tokens for video processing
# to avoid repeated tokenization
self._img_start_token_ids = tokenizer.encode(
IMG_START, add_special_tokens=False
)
self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
self._img_context_token_ids = tokenizer.encode(
IMG_CONTEXT, add_special_tokens=False
)
@property
def supports_video(self) -> bool:
return self.video_token_id is not None
@property
def video_token_id(self) -> int | None:
if self.video_token is None:
return None
return self.tokenizer.get_vocab().get(self.video_token, None)
@property
def image_token_id(self) -> int:
return self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
def _videos_to_pixel_values_lst(
self,
videos: list[npt.NDArray],
max_num_tiles: int,
) -> list[torch.Tensor]:
return [
video_to_pixel_values(
video,
input_size=self.image_size,
max_num_tiles=max_num_tiles,
use_thumbnail=self.use_thumbnail,
)
for video in videos
]
def _preprocess_video(
self,
text: list[str],
videos: list[tuple[npt.NDArray, dict[str, Any]]],
max_num_tiles: int,
) -> tuple[list[str], dict[str, Any]]:
if len(videos) == 0 or not self.supports_video:
return text, {}
videos_lst = [v[0] for v in videos]
video_metadata_lst = [v[1] for v in videos]
pixel_values_lst_video = self._videos_to_pixel_values_lst(
videos_lst,
max_num_tiles=max_num_tiles,
)
# We use frame duration in milliseconds (as integer) to ensure
# we have consistent timestamps calculation. At preprocessing
# fps parameter is given in fp32, while at inference it is bf16
# which leads to inaccurate timestamp calculation and causes
# timestamp values to differ.In rare cases this causes
# mismatching number of output tokens for tokenized frame prefixes
frame_duration_ms_lst = [
int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
]
frames_indices_lst = [
metadata["frames_indices"] for metadata in video_metadata_lst
]
video_num_patches = torch.tensor([len(item) for item in pixel_values_lst_video])
video_inputs = {
"pixel_values_flat_video": input_conditioner(
torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
),
"video_num_patches": video_num_patches,
"frames_indices": frames_indices_lst,
"frame_duration_ms": torch.tensor(frame_duration_ms_lst),
}
image_size: int = self.config.force_image_size
patch_size: int = self.config.patch_size
downsample_ratio = self.config.downsample_ratio
tokens_in_single_frame = int(
(image_size * image_size // patch_size**2) * (downsample_ratio**2)
)
for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
pixel_values_lst_video,
video_metadata_lst,
frames_indices_lst,
frame_duration_ms_lst,
):
num_frames = pixel_values.shape[0]
if self.video_pruning_rate is not None and self.video_pruning_rate > 0.0:
# Start of EVS-specific code
num_tokens = compute_retained_tokens_count(
tokens_per_frame=tokens_in_single_frame,
num_frames=num_frames,
q=self.video_pruning_rate,
)
# Here we just need placeholders that won't actually be replaced -
# we just need to make sure the total number of tokens is correct
# assign all tokens to the first frame
tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
# End of EVS-specific code
else:
tokens_per_frame = [tokens_in_single_frame] * num_frames
video_repl = self.get_video_repl(
tokens_per_frame=tokens_per_frame,
frames_indices=frames_indices,
frame_duration_ms=frame_duration_ms,
tokenizer=self.tokenizer,
img_start_token_ids=self._img_start_token_ids,
img_end_token_ids=self._img_end_token_ids,
img_context_token_ids=self._img_context_token_ids,
)
# video_repl.full is a list of token IDs
# Convert token IDs back to text for the HF processor flow
video_repl_text = self.tokenizer.decode(
video_repl.full, skip_special_tokens=False
)
text = [t.replace("<video>", video_repl_text, 1) for t in text]
return text, video_inputs
def _preprocess_audio(
self,
text: list[str],
audios: list[npt.NDArray],
) -> tuple[list[str], dict[str, Any]]:
if len(audios) == 0:
return text, {}
assert self.audio_extractor is not None
extractor = self.audio_extractor
parts = [x for x in re.split(f"({re.escape(AUDIO_CONTEXT)})", text[0]) if x]
token_count = parts.count(AUDIO_CONTEXT)
if token_count != len(audios):
raise ValueError(
"Number of audio tokens in text does not match the number "
f"of audios (tokens={token_count}, audios={len(audios)})."
)
audio_index = 0
for idx, part in enumerate(parts):
if part == AUDIO_CONTEXT:
audio_repl = self.get_audio_repl(audios[audio_index])
parts[idx] = audio_repl.full
audio_index += 1
text = ["".join(parts)]
audio_inputs = extractor(
audios,
sampling_rate=extractor.sampling_rate,
return_tensors="pt",
)
input_audio_features = audio_inputs.input_features
feature_attention_mask = audio_inputs.attention_mask
audio_feature_lengths = feature_attention_mask.sum(dim=1)
audio_inputs = {
"input_audio_features": input_audio_features,
"feature_attention_mask": feature_attention_mask,
"audio_feature_lengths": audio_feature_lengths,
}
return text, audio_inputs
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
videos: tuple[npt.NDArray, dict[str, Any]]
| list[tuple[npt.NDArray, dict[str, Any]]]
| None = None,
audios: AudioItem | list[AudioItem] | None = None,
*,
return_tensors: str | TensorType | None = None,
max_num_tiles: int | None = None,
**kwargs,
) -> BatchFeature:
# Use default if not provided
if max_num_tiles is None:
max_num_tiles = self.max_num_tiles
text = self._make_batch_input(text)
images = self._make_batch_input(images)
videos = self._make_batch_input(videos)
audios = self._make_batch_input(audios)
text, image_inputs = self._preprocess_image(
text=text,
images=images,
max_num_tiles=max_num_tiles,
)
text, video_inputs = self._preprocess_video(
text=text,
videos=videos,
max_num_tiles=1,
)
text, audio_inputs = self._preprocess_audio(
text=text,
audios=audios,
)
text_inputs = self.tokenizer(text, add_special_tokens=False)
combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}
if self.dynamic_tiler is None:
batch = BatchFeature(
{**combined_inputs, **image_inputs},
tensor_type=return_tensors,
)
else:
batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
# allow images to be exempt from the BatchFeature validation:
# We will .stack() them in _parse_and_validate_image_input
batch.update(image_inputs)
return batch
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def get_audio_repl(
self,
audio: npt.NDArray,
) -> PromptUpdateDetails[str]:
assert self.audio_extractor is not None
num_tokens = self.audio_extractor.audio_token_count(len(audio))
repl_full = f"{AUDIO_START}{AUDIO_CONTEXT * num_tokens}{AUDIO_END}"
return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)
@classmethod
def get_video_repl(
cls,
*,
tokens_per_frame: list[int],
frames_indices: list[int],
frame_duration_ms: int,
tokenizer: TokenizerLike,
img_start_token_ids: list[int],
img_end_token_ids: list[int],
img_context_token_ids: list[int],
) -> PromptUpdateDetails[list[int]]:
"""
Build prompt replacement for a video.
The replacement returned is not actually used to replace the placeholder
tokens - it's just used to make sure we allocate the correct number
of tokens.
Actual replacement is done in embed_multimodal of
NemotronH_Nano_VL_V2
(specifically in _process_video_input -> _create_final_video_embeddings).
There, we create the final embeddings with text embeddings for indicator tokens
and video embeddings for video tokens.
This is a single function that handles all cases - non EVS, EVS dummy, EVS real.
The differentiation is done via tokens_per_frame parameter.
- non EVS case - constant value same value across all frames
- EVS dummy - Doesn't matter how tokens are distributed between frames - just
make sure the total number of tokens is correct.
- EVS real (called from get_real_video_repl_for_evs) - different value per frame
Args:
tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds
tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
"""
# TODO: Add support of frame_duration_ms to be None
# At preprocessing step we should allow absent / metadata without
# frames_indices field.
timestamps_enabled = frame_duration_ms is not None
if timestamps_enabled:
timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
assert len(timestamps) == len(tokens_per_frame), (
"timestamps and tokens_per_frame must have the same length"
)
frame_separators = [
f"Frame {i + 1} sampled at {timestamp:.2f} seconds: "
for i, timestamp in enumerate(timestamps)
]
else:
frame_separators = [
f"Frame {i + 1}: " for i, _ in enumerate(tokens_per_frame)
]
# Tokenize frame separator independently
frame_separators_tokenized = [
_seq2tokens(tokenizer, sep) for sep in frame_separators
]
# Tokenize each component independently to avoid tokenizer merging tokens
# across boundaries. This ensures consistent tokenization regardless of
# num_tokens_per_frame values.
all_token_ids = []
for i, num_tokens in enumerate(tokens_per_frame):
frame_sep_token_ids = frame_separators_tokenized[i]
all_token_ids.extend(frame_sep_token_ids)
# Add pre-tokenized special tokens
all_token_ids.extend(img_start_token_ids)
all_token_ids.extend(img_context_token_ids * num_tokens)
all_token_ids.extend(img_end_token_ids)
return PromptUpdateDetails.from_seq(all_token_ids)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
from typing import TypeVar
import numpy as np
import torch
from PIL import Image
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torchvision import transforms as T
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.tokenizers import TokenizerLike
_T = TypeVar("_T")
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class NemotronParseImageProcessor:
"""
NemotronParse Image Processor
"""
def __init__(
self,
final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
**kwargs,
):
# Ensure final_size is properly formatted
if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
self.final_size = (int(final_size[0]), int(final_size[1]))
elif isinstance(final_size, (int, float)):
self.final_size = (int(final_size), int(final_size))
else:
self.final_size = DEFAULT_FINAL_IMAGE_SIZE # Default fallback
self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
# Create transforms
self._create_transforms()
def _create_transforms(self):
"""Create transform objects."""
try:
import albumentations as A
except ImportError as err:
raise ImportError(
"The package `albumentations` is required to use "
"NemotronParse model. Please install it with `pip install "
"albumentations`."
) from err
# Ensure final_size is a tuple of integers
if isinstance(self.final_size, (list, tuple)):
self.target_height, self.target_width = (
int(self.final_size[0]),
int(self.final_size[1]),
)
else:
self.target_height = self.target_width = int(self.final_size)
import cv2
self.transform = A.Compose(
[
A.PadIfNeeded(
min_height=self.target_height,
min_width=self.target_width,
border_mode=cv2.BORDER_CONSTANT,
fill=[255, 255, 255],
p=1.0,
),
]
)
self.torch_transform = T.Compose(
[
T.ToTensor(),
]
)
def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
"""Resize image maintaining aspect ratio (exact replica of original
LongestMaxSizeHW)."""
height, width = image.shape[:2]
max_size_height = self.target_height
max_size_width = self.target_width
# Original LongestMaxSizeHW algorithm from custom_augmentations.py
aspect_ratio = width / height
new_height = height
new_width = width
# If height too big then scale image down
if height > max_size_height:
new_height = max_size_height
new_width = int(new_height * aspect_ratio)
# If width too big, scale image down further
if new_width > max_size_width:
new_width = max_size_width
new_height = int(new_width / aspect_ratio)
# Use cv2.INTER_LINEAR like the original
import cv2
return cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
)
def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
"""Pad image to target size with white padding (matches A.PadIfNeeded
behavior)."""
h, w = image.shape[:2]
min_height, min_width = self.target_height, self.target_width
# Only pad if image is smaller than target (matches A.PadIfNeeded logic)
pad_h = max(0, min_height - h)
pad_w = max(0, min_width - w)
if pad_h == 0 and pad_w == 0:
return image
# A.PadIfNeeded pads to bottom-right with constant value
if len(image.shape) == 3:
# Color image - pad bottom and right with white (255, 255, 255)
padded = np.pad(
image,
((0, pad_h), (0, pad_w), (0, 0)),
mode="constant",
constant_values=255,
)
else:
# Grayscale image - pad with white (255)
padded = np.pad(
image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
)
return padded
def preprocess(
self,
images: Image.Image | list[Image.Image],
**kwargs,
) -> dict[str, torch.Tensor]:
"""
Preprocess an image or batch of images for the NemotronParse model.
Args:
images: Input image(s)
"""
# Ensure images is a list
if not isinstance(images, list):
images = [images]
# Convert PIL images to numpy arrays if needed
processed_images = []
for image in images:
if isinstance(image, Image.Image):
image = np.asarray(image)
processed_images.append(image)
# Apply NemotronParse-specific transforms
pixel_values = []
for image in processed_images:
# Manual resize with aspect ratio preservation
# (replaces LongestMaxSizeHW)
processed_image = self._resize_with_aspect_ratio(image)
# Apply remaining albumentations transforms if available
if self.transform is not None:
transformed = self.transform(image=processed_image)
processed_image = transformed["image"]
else:
# Fallback: just pad to target size
processed_image = self._pad_to_size(processed_image)
# Convert to tensor
pixel_values_tensor = self.torch_transform(processed_image)
# Handle grayscale images
if pixel_values_tensor.shape[0] == 1:
pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
pixel_values.append(pixel_values_tensor)
# Stack into batch
pixel_values = torch.stack(pixel_values)
# Normalize pixel values
normalized_values = (pixel_values - self.norm_mean) / self.norm_std
return {"pixel_values": normalized_values}
def __call__(
self, images: Image.Image | list[Image.Image], **kwargs
) -> dict[str, torch.Tensor]:
return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
"""
NemotronParse Processor
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
**kwargs,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)
def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
image_inputs = {} if len(images) == 0 else self.image_processor(images)
text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
combined_outputs = BatchFeature(
data={**text_inputs, **image_inputs},
tensor_type=return_tensors,
)
return combined_outputs
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import InternVLProcessor
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
Image.MAX_IMAGE_PIXELS = None # Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels
def build_transform(input_size: int):
return T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
),
T.ToTensor(),
]
)
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_factor = float("-inf")
best_ratio = (1, 1)
area = width * height
for rw, rh in target_ratios:
target_aspect_ratio = rw / rh
size_factor = min((rw * rh * image_size * image_size) / area, 0.6)
ratio_closeness = min(
target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio
)
factor = size_factor * ratio_closeness
if factor > best_factor:
best_factor = factor
best_ratio = (rw, rh)
return best_ratio
def calculate_nemotron_vl_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height
def dynamic_preprocess_nemotron_vl(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> list[Image.Image]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
blocks, target_width, target_height = calculate_nemotron_vl_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def get_nemotron_vl_target_ratios(
min_num: int,
max_num: int,
) -> list[tuple[int, int]]:
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if min_num <= i * j <= max_num
}
return sorted(target_ratios, key=lambda x: x[0] * x[1])
def image_to_pixel_values_nemotron_vl(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
transform: T.Compose | None = None,
) -> torch.Tensor:
target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
if transform is None:
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_nemotron_vl(
image,
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values
class NemotronVLProcessor(InternVLProcessor):
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
image_processor: BaseImageProcessorFast,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
ABC.__init__(self)
self.config = config
self.tokenizer = tokenizer
self.image_processor = image_processor
image_size: int = config.force_image_size
patch_size: int = config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = 1
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = self.image_processor.max_num_tiles
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = True
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
if image_processor is not None:
self.use_thumbnail = image_processor.use_thumbnail
else:
self.use_thumbnail = getattr(config, "use_thumbnail", True)
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
def _get_transform(self) -> T.Compose:
return build_transform(input_size=self.image_size)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_nemotron_vl(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=self._get_transform(),
)
for image in images
]
def _replace_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Replace <image> placeholders with image tokens."""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
def _preprocess_image(
self,
text: list[str],
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
text = self._replace_image_tokens(text, pixel_values_lst)
return text, image_inputs
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = self.IMG_CONTEXT * feature_size
repl_full = self.IMG_START + repl_features + self.IMG_END
return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
"""Build transform for SigLIP vision encoder with normalization.
Extends the base transform from nemotron_vl with SigLIP-specific normalization.
"""
return T.Compose(
[
build_transform(input_size=input_size),
T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
]
)
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
"""
Processor for LlamaNemotronVL embedding model.
Inherits from NemotronVLProcessor and specializes it for embedding tasks:
- Uses SigLIP transform with normalization instead of base transform
- Uses different image context token (<IMG_CONTEXT> vs <image>)
"""
IMG_CONTEXT = "<IMG_CONTEXT>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
processor_config: dict,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
if min_dynamic_patch is None:
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
if max_dynamic_patch is None:
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
if dynamic_image_size is None:
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
super().__init__(
config=config,
tokenizer=tokenizer,
image_processor=None,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
def _get_transform(self) -> T.Compose:
"""Override to add SigLIP normalization."""
return build_siglip_transform(input_size=self.image_size)
def _replace_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Override with simpler token replacement for embedding model.
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
not <image>, so there's no collision risk.
"""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
# --------------------------------------------------------
# NVLM-D
# Copyright (c) 2024 NVIDIA
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from vllm.multimodal.processing import PromptUpdateDetails
from .internvl import BaseInternVLProcessor
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_PAD]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
if self.use_thumbnail:
tile_pos_identifiers += ["<tile_global_thumbnail>"]
context_size = feature_size // num_patches
features = "".join(
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
)
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
repl = "<Image>" + features + "</Image>"
return PromptUpdateDetails.select_text(repl, IMG_PAD)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
return T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
aspect_ratio: float,
target_ratios: list[tuple[int, int]],
*,
width: int,
height: int,
image_size: int,
) -> tuple[int, int]:
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def resolve_skyworkr1v_min_max_num(
*,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
max_dynamic_patch += 1
return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
min_num: int,
max_num: int,
) -> list[tuple[int, int]]:
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if min_num <= i * j <= max_num
}
return sorted(target_ratios, key=lambda x: x[0] * x[1])
def calculate_skyworkr1v_targets(
*,
orig_width: int,
orig_height: int,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> tuple[int, int, int]:
aspect_ratio = orig_width / orig_height
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio,
target_ratios,
width=orig_width,
height=orig_height,
image_size=image_size,
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# add thumbnail image if num_blocks != 1
if use_thumbnail and blocks != 1:
blocks += 1
return blocks, target_width, target_height
def dynamic_preprocess_skyworkr1v(
image: Image.Image,
*,
target_ratios: list[tuple[int, int]],
image_size: int,
use_thumbnail: bool,
) -> list[Image.Image]:
orig_width, orig_height = image.size
# calculate the number of blocks without thumbnail
blocks, target_width, target_height = calculate_skyworkr1v_targets(
orig_width=orig_width,
orig_height=orig_height,
target_ratios=target_ratios,
image_size=image_size,
use_thumbnail=False,
)
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
image: Image.Image,
*,
input_size: int,
min_num: int,
max_num: int,
use_thumbnail: bool,
) -> torch.Tensor:
target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_skyworkr1v(
image,
target_ratios=target_ratios,
image_size=input_size,
use_thumbnail=use_thumbnail,
)
pixel_values = torch.stack([transform(image) for image in images])
return pixel_values
class SkyworkR1VProcessor:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
return resolve_skyworkr1v_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
self,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
return get_skyworkr1v_target_ratios(min_num, max_num)
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
num_patches, _, _ = calculate_skyworkr1v_targets(
orig_width=image_width,
orig_height=image_height,
image_size=self.image_size,
target_ratios=target_ratios,
use_thumbnail=self.use_thumbnail,
)
return num_patches * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
image_to_pixel_values_skyworkr1v(
image,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
)
for image in images
]
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment