Commit 657855ab (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[Misc] Cleanup more configs and processors (#37560)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent e27b8ba3
...@@ -55,7 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -55,7 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"OvisConfig": "vllm.transformers_utils.configs.ovis", "OvisConfig": "vllm.transformers_utils.configs.ovis",
"PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac", "PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
"RadioConfig": "vllm.transformers_utils.configs.radio", "RadioConfig": "vllm.transformers_utils.configs.radio",
"SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base", "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators",
"UltravoxConfig": "vllm.transformers_utils.configs.ultravox", "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
"Step3VLConfig": "vllm.transformers_utils.configs.step3_vl", "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
"Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl", "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .base import SpeculatorsConfig
__all__ = ["SpeculatorsConfig"]
...@@ -8,9 +8,6 @@ from transformers import PretrainedConfig ...@@ -8,9 +8,6 @@ from transformers import PretrainedConfig
from vllm.transformers_utils.configs.speculators.algos import ( from vllm.transformers_utils.configs.speculators.algos import (
SUPPORTED_SPECULATORS_TYPES, SUPPORTED_SPECULATORS_TYPES,
) )
__all__ = ["SpeculatorsConfig"]
from vllm.transformers_utils.utils import without_trust_remote_code from vllm.transformers_utils.utils import without_trust_remote_code
......
...@@ -21,7 +21,9 @@ __all__ = [ ...@@ -21,7 +21,9 @@ __all__ = [
"HunYuanVLProcessor", "HunYuanVLProcessor",
"HunYuanVLImageProcessor", "HunYuanVLImageProcessor",
"InternVLProcessor", "InternVLProcessor",
"IsaacProcessor",
"KimiAudioProcessor", "KimiAudioProcessor",
"KimiK25Processor",
"MistralCommonPixtralProcessor", "MistralCommonPixtralProcessor",
"MistralCommonVoxtralProcessor", "MistralCommonVoxtralProcessor",
"NanoNemotronVLProcessor", "NanoNemotronVLProcessor",
...@@ -32,6 +34,7 @@ __all__ = [ ...@@ -32,6 +34,7 @@ __all__ = [
"Ovis2_5Processor", "Ovis2_5Processor",
"QwenVLProcessor", "QwenVLProcessor",
"Qwen3ASRProcessor", "Qwen3ASRProcessor",
"Step3VLProcessor",
] ]
_CLASS_TO_MODULE: dict[str, str] = { _CLASS_TO_MODULE: dict[str, str] = {
...@@ -45,7 +48,9 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -45,7 +48,9 @@ _CLASS_TO_MODULE: dict[str, str] = {
"HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl", "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
"HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image", "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
"InternVLProcessor": "vllm.transformers_utils.processors.internvl", "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
"IsaacProcessor": "vllm.transformers_utils.processors.isaac",
"KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio", "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
"KimiK25Processor": "vllm.transformers_utils.processors.kimi_k25",
"MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral", "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
"MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral", "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
"NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl", "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
...@@ -56,6 +61,7 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -56,6 +61,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5", "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl", "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr", "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
"Step3VLProcessor": "vllm.transformers_utils.processors.step3_vl",
} }
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import math
from typing import Any
import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from transformers import BatchFeature, ProcessorMixin, TensorType
from typing_extensions import TypedDict, Unpack
# Upper bound on width * height accepted by `extract_image_pil`; larger images
# are rejected with a ValueError.
MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px
# Vision preprocessing constants.
# `prepare_image_tensor` computes (pixel * VISION_SCALE - mean) / std, so with
# these values an 8-bit input range of 0..255 maps onto [-1, 1].
VISION_MEAN = (0.5, 0.5, 0.5)
VISION_STD = (0.5, 0.5, 0.5)
VISION_SCALE = 1 / 255
def _make_writeable(arr: np.ndarray) -> np.ndarray:
    """Return a writeable array holding the data of *arr*.

    A writeable input is returned unchanged. Otherwise the write flag is
    flipped in place (no copy); if NumPy refuses because the underlying
    buffer is inherently read-only, a fresh copy is returned instead.

    The result is intended for `torch.from_numpy()`, which warns when it is
    handed a non-writeable buffer.
    """
    if not arr.flags.writeable:
        try:
            # Cheap path: toggle the flag without touching the data.
            arr.setflags(write=True)
        except ValueError:
            # The buffer cannot be made writeable; fall back to copying it.
            arr = arr.copy()
    return arr
def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
    """Convert a PIL image into a tensor of its RGB pixel data.

    Args:
        image (`PIL.Image.Image`):
            Source image. Any mode other than `"RGB"` is converted to RGB.

    Returns:
        `torch.Tensor`: Tensor created with `torch.from_numpy` from the RGB
        pixel array of the image.

    Raises:
        ValueError: If `width * height` exceeds `MAX_PIXELS`.
    """
    if image.width * image.height > MAX_PIXELS:
        raise ValueError(
            f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
        )
    rgb = image.convert("RGB") if image.mode != "RGB" else image
    # torch.from_numpy wants a writeable buffer; arrays taken from PIL may not be.
    pixels = _make_writeable(np.asarray(rgb))
    return torch.from_numpy(pixels)
def get_image_size_for_max_num_patches(
    image_height: int,
    image_width: int,
    patch_size: int,
    max_num_patches: int,
    min_num_patches: int | None = None,
    eps: float = 1e-5,
    pixel_shuffle_scale: int = 1,
) -> tuple[int, int]:
    r"""Pick a resize target whose patch grid fits the patch-count limits.

    Args:
        image_height (`int`):
            Height in pixels of the source image.
        image_width (`int`):
            Width in pixels of the source image.
        patch_size (`int`):
            Edge length of the square patches.
        max_num_patches (`int`):
            Upper bound on `(height / patch_size) * (width / patch_size)`.
        min_num_patches (`int`, *optional*):
            Lower bound on the patch count. When given and the snapped source
            size falls below it, the image is scaled up.
        eps (`float`, *optional*, defaults to 1e-5):
            The bisection over the scale factor stops once the search
            interval is narrower than this.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Extra multiplier on the alignment step; both output dimensions
            are multiples of `patch_size * pixel_shuffle_scale`.

    Returns:
        `tuple[int, int]`: Target `(height, width)` in pixels.
    """
    stride = patch_size * pixel_shuffle_scale

    def scaled_size(factor: float) -> tuple[int, int]:
        # Scale both sides, then round each up to a multiple of `stride`
        # (never smaller than one stride).
        snapped = []
        for side in (image_height, image_width):
            length = math.ceil(factor * side / stride) * stride
            snapped.append(int(max(stride, length)))
        return snapped[0], snapped[1]

    def patch_count(height: float, width: float) -> float:
        return (height / patch_size) * (width / patch_size)

    # Source size rounded up to the alignment grid, without any rescaling.
    base_height = max(stride, math.ceil(image_height / stride) * stride)
    base_width = max(stride, math.ceil(image_width / stride) * stride)
    base_patches = patch_count(base_height, base_width)

    too_few = min_num_patches is not None and base_patches < min_num_patches
    if not too_few and base_patches <= max_num_patches:
        return base_height, base_width

    if too_few:
        # Bisect for the smallest factor in [1, 100] that reaches the minimum.
        low, high = 1.0, 100.0
        while (high - low) >= eps:
            middle = (low + high) / 2
            if patch_count(*scaled_size(middle)) >= min_num_patches:
                high = middle
            else:
                low = middle
        return scaled_size(high)

    # Bisect for the largest factor in [eps / 10, 1] that respects the maximum.
    low, high = eps / 10, 1.0
    while (high - low) >= eps:
        middle = (low + high) / 2
        if patch_count(*scaled_size(middle)) <= max_num_patches:
            low = middle
        else:
            high = middle
    return scaled_size(low)
# Normalization statistics as float32 tensors, built once at import time.
# They are viewed as (1, 1, 1, C) so they broadcast over the trailing channel
# axis of channel-last image tensors in `prepare_image_tensor`.
_MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1)
_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1)
def prepare_image_tensor(
    image: torch.Tensor,
    scale: float = VISION_SCALE,
) -> torch.Tensor:
    r"""Rescale and normalize RGB images ahead of patch extraction.

    Args:
        image (`torch.Tensor`):
            Tensor with shape `(..., height, width, 3)` containing RGB values.
            Non-floating-point tensors are converted with `.float()`;
            floating-point tensors are used as they are.
        scale (`float`, *optional*, defaults to `VISION_SCALE`):
            Scalar multiplier applied before normalization.

    Returns:
        `torch.Tensor`: `(image * scale - mean) / std`, with the same shape as
        the input.
    """
    pixels = image if torch.is_floating_point(image) else image.float()
    # The statistics are module-level tensors; move them to the input's device.
    target_device = pixels.device
    mean = _MEAN_TENSOR.to(target_device)
    std = _STD_TENSOR.to(target_device)
    return (pixels * scale - mean) / std
def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor:
    r"""Cut channel-last images into a grid of flattened square patches.

    Args:
        image (`torch.Tensor`):
            Tensor of shape `(num_images, height, width, channels)`.
        patch_size (`int`):
            Edge length of the square patches.

    Returns:
        `torch.Tensor`:
            Tensor of shape `(num_images, height // patch_size,
            width // patch_size, channels * patch_size * patch_size)`. Each
            grid cell holds its patch flattened in (row, column, channel)
            order.

    Raises:
        ValueError: If `height` or `width` is not divisible by `patch_size`.
    """
    batch, height, width, channels = image.shape
    grid_rows, leftover_rows = divmod(height, patch_size)
    grid_cols, leftover_cols = divmod(width, patch_size)
    if leftover_rows or leftover_cols:
        raise ValueError(
            "Dimensions of images "
            f"{image.shape} are not divisible by patch_size={patch_size}."
        )

    # (B, rows, ph, cols, pw, C) -> (B, rows, cols, ph, pw, C) -> flatten patch.
    tiles = image.reshape(
        batch, grid_rows, patch_size, grid_cols, patch_size, channels
    )
    tiles = tiles.permute(0, 1, 3, 2, 4, 5)
    flat_dim = channels * patch_size * patch_size
    return tiles.reshape(batch, grid_rows, grid_cols, flat_dim)
def process_vision_for_patches(
    images: torch.Tensor,
    patch_size: int,
    max_num_patches: int,
    min_num_patches: int | None = None,
    pixel_shuffle_scale: int = 1,
) -> tuple[torch.Tensor, list[int]]:
    r"""Resize, normalize, and patchify RGB images for the vision encoder.

    Args:
        images (`torch.Tensor`):
            Either `(height, width, channels)` for a single image or
            `(num_images, height, width, channels)` for a batch.
        patch_size (`int`):
            Edge length of the square patches.
        max_num_patches (`int`):
            Maximum number of patches allowed after resizing.
        min_num_patches (`int`, *optional*):
            Minimum number of patches; forwarded to the target-size search.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Pixel shuffle factor; constrains the target size and divides the
            grid reported in `dims_virtual`.

    Returns:
        `tuple[torch.Tensor, list[int]]`: `(patches, dims_virtual)` where
        `patches` has shape `(num_images, target_h / patch_size,
        target_w / patch_size, channels * patch_size**2)` and `dims_virtual`
        is `[1, grid_h, grid_w]` with both grid sizes floor-divided by
        `pixel_shuffle_scale` when that factor is not 1.
    """
    batch = images.unsqueeze(0) if images.dim() == 3 else images

    # F.interpolate expects channel-first input.
    channel_first = batch.permute(0, 3, 1, 2)
    source_height, source_width = channel_first.shape[-2:]
    target_height, target_width = get_image_size_for_max_num_patches(
        source_height,
        source_width,
        patch_size,
        max_num_patches,
        min_num_patches=min_num_patches,
        pixel_shuffle_scale=pixel_shuffle_scale,
    )
    resized = F.interpolate(
        channel_first,
        size=(target_height, target_width),
        mode="bilinear",
        align_corners=False,
    )

    # Normalization and patch extraction both work on channel-last tensors.
    normalized = prepare_image_tensor(resized.permute(0, 2, 3, 1))
    patches = patchify_vision(normalized, patch_size=patch_size)

    grid_height, grid_width = patches.shape[1:3]
    if pixel_shuffle_scale == 1:
        dims_virtual = [1, grid_height, grid_width]
    else:
        dims_virtual = [
            1,
            grid_height // pixel_shuffle_scale,
            grid_width // pixel_shuffle_scale,
        ]
    return patches, dims_virtual
class IsaacImageProcessorKwargs(TypedDict, total=False):
    """Keyword arguments typed for `IsaacImageProcessor.preprocess`.

    `total=False` makes every key optional.
    """

    # NOTE(review): `IsaacImageProcessor.__init__` pops the keys
    # "vision_max_num_patches" / "vision_min_num_patches", whereas the keys
    # declared here are "max_num_patches" / "min_num_patches" -- confirm which
    # spelling callers are expected to use.
    patch_size: int
    max_num_patches: int
    min_num_patches: int
    pixel_shuffle_scale: int
class IsaacImageProcessor:
    """Turn PIL images into flattened patches plus per-image grid sizes."""

    # Class-level defaults, used when the config dict given to `__init__`
    # does not override them.
    patch_size = 16
    max_num_patches = 6144
    min_num_patches = 256
    pixel_shuffle_scale = 2
    valid_kwargs = IsaacImageProcessorKwargs
    model_input_names = ["pixel_values", "image_grid_thw"]

    def __init__(self, kwargs):
        """Read the processor settings out of `kwargs`.

        Args:
            kwargs (`dict`):
                Configuration mapping. The keys `patch_size`,
                `vision_max_num_patches`, `vision_min_num_patches` and
                `pixel_shuffle_scale` are popped from it (the dict is
                mutated); missing keys fall back to the class defaults.
        """
        self.patch_size = kwargs.pop("patch_size", self.patch_size)
        self.vision_max_num_patches = kwargs.pop(
            "vision_max_num_patches", self.max_num_patches
        )
        self.vision_min_num_patches = kwargs.pop(
            "vision_min_num_patches", self.min_num_patches
        )
        # Fall back to the class attribute rather than a repeated literal so
        # the default is defined in exactly one place.
        self.pixel_shuffle_scale = kwargs.pop(
            "pixel_shuffle_scale", self.pixel_shuffle_scale
        )

    def preprocess(
        self,
        images: list[PIL.Image.Image],
        return_tensors: str | TensorType | None = None,
        **kwargs: Unpack[IsaacImageProcessorKwargs],
    ) -> BatchFeature:
        """Preprocess images into format compatible with vLLM input processing.

        Args:
            images (`list[PIL.Image.Image]`):
                Images to process; each one is passed to `extract_image_pil`.
            return_tensors (`str` or `TensorType`, *optional*):
                Forwarded to `BatchFeature` as `tensor_type`.
            **kwargs:
                Accepted for interface compatibility. Per-call overrides are
                not applied; the values stored by `__init__` are always used.

        Returns:
            `BatchFeature` with:
            - **pixel_values** -- patches of all images concatenated along
              dim 0, one row per patch.
            - **image_grid_thw** -- one `[1, grid_h, grid_w]` row per image,
              holding the real (pre pixel-shuffle) patch grid.
        """
        all_pixel_values: list[torch.Tensor] = []
        all_image_grids: list[torch.Tensor] = []

        for image in images:
            image_tensor = extract_image_pil(image)
            patches, _ = process_vision_for_patches(
                image_tensor,
                patch_size=self.patch_size,
                max_num_patches=self.vision_max_num_patches,
                min_num_patches=self.vision_min_num_patches,
                pixel_shuffle_scale=self.pixel_shuffle_scale,
            )

            # `patches` is (1, Hp, Wp, D) for a single image; flatten the grid
            # into one row per patch.
            hp, wp, dim = patches.shape[-3:]
            all_pixel_values.append(patches.reshape(hp * wp, dim))

            # Report the real patch grid (with a dummy temporal dim of 1), not
            # the pixel-shuffled one, so the vision model gets the grid it
            # needs to perform pixel shuffle itself.
            all_image_grids.append(torch.tensor([[1, hp, wp]]))

        if all_pixel_values:
            final_pixel_values = torch.cat(all_pixel_values, dim=0)
            final_image_grids = torch.cat(all_image_grids, dim=0)
        else:
            final_pixel_values = torch.empty(0, 0)
            # Keep the integer dtype of the non-empty path.
            final_image_grids = torch.empty(0, 3, dtype=torch.long)

        return BatchFeature(
            data={
                "pixel_values": final_pixel_values,
                "image_grid_thw": final_image_grids,
            },
            tensor_type=return_tensors,
        )
class IsaacProcessor(ProcessorMixin):
    """Combine the Isaac image processor and tokenizer into one callable."""

    attributes = ["image_processor", "tokenizer"]

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Store the components.

        Args:
            image_processor: Object exposing `preprocess(images, **kwargs)` and
                a `pixel_shuffle_scale` attribute.
            tokenizer: Tokenizer called on the expanded text.
            **kwargs: `image_token` (defaults to `"<image>"`) is the marker
                in the text that stands for one image.
        """
        self.image_token = kwargs.pop("image_token", "<image>")
        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def _expand_image_tokens(self, text, image_grid_thw):
        """Return a copy of `text` with each image marker expanded.

        The n-th marker (counted across all strings) is replaced by
        `prod(image_grid_thw[n]) // pixel_shuffle_scale**2` copies of
        `<|image_pad|>`.
        """
        merge_length = self.image_processor.pixel_shuffle_scale**2
        expanded = []
        index = 0
        for prompt in text:
            while self.image_token in prompt:
                num_image_tokens = int(image_grid_thw[index].prod()) // merge_length
                # Expand through a temporary placeholder so the freshly
                # inserted text can never be matched as an image marker again.
                prompt = prompt.replace(
                    self.image_token, "<|placeholder|>" * num_image_tokens, 1
                )
                index += 1
            expanded.append(prompt.replace("<|placeholder|>", "<|image_pad|>"))
        return expanded

    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
        """Preprocess images and/or tokenize text.

        Args:
            text (`str` or `list[str]`, *optional*): Prompt(s) to tokenize.
            images (*optional*): Images handed to the image processor.
            **kwargs: Forwarded to both the image processor and the tokenizer.

        Returns:
            `BatchFeature` holding the image processor outputs (when `images`
            is given) and the tokenizer outputs (when `text` is given).

        When `images` is `None` the text is tokenized unchanged: there are no
        grid sizes to expand image markers with.
        """
        result = {}
        image_grid_thw = None

        if images is not None:
            image_inputs = self.image_processor.preprocess(images, **kwargs)
            image_grid_thw = image_inputs["image_grid_thw"]
            result.update(image_inputs)

        if text is not None:
            if not isinstance(text, list):
                text = [text]
            if image_grid_thw is not None:
                text = self._expand_image_tokens(text, image_grid_thw)
            result.update(self.tokenizer(text, **kwargs))

        return BatchFeature(result)

    def apply_chat_template(
        self,
        messages: list[dict[str, Any]],
        tokenize: bool = False,
        add_generation_prompt: bool = False,
        **kwargs,
    ) -> Any:
        """Flatten mixed-content messages and apply the tokenizer's template.

        Messages whose `content` is a list are collapsed into one string:
        `text` items contribute their text, `image` items contribute
        `self.image_token`, and other item types are dropped. All other
        messages are passed through unchanged. `return_dict` is always forced
        to `False`.
        """
        processed_messages = []
        for message in messages:
            if "content" in message and isinstance(message["content"], list):
                text_parts = []
                for content_item in message["content"]:
                    item_type = content_item.get("type")
                    if item_type == "text":
                        text_parts.append(content_item.get("text", ""))
                    elif item_type == "image":
                        text_parts.append(self.image_token)
                processed_messages.append(
                    {
                        "role": message.get("role", "user"),
                        "content": "".join(text_parts),
                    }
                )
            else:
                processed_messages.append(message)

        kwargs["return_dict"] = False
        return self.tokenizer.apply_chat_template(
            processed_messages,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from transformers import BatchFeature
from transformers.processing_utils import ProcessorMixin
from vllm.multimodal.inputs import VisionChunk
class KimiK25Processor(ProcessorMixin):
    """Expand media placeholder tokens and bundle them with media features."""

    attributes = ["tokenizer"]
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self, media_processor=None, tokenizer=None, media_token_id: int | None = None
    ):
        """Store the media processor and the id of the media placeholder token.

        `media_token_id` must be provided; `None` fails the assertion below.
        """
        super().__init__(tokenizer)
        self.media_processor = media_processor
        self.media_token_id = media_token_id
        assert self.media_token_id is not None

    def __call__(
        self,
        vision_chunks: list[VisionChunk] | None = None,
        *,
        text: list[int] | str,
        **kwargs,
    ) -> BatchFeature:
        """
        Args:
            vision_chunks: List of VisionChunk items to be processed.
                For image: VisionChunkImage with type='image', image=PIL.Image
                For video_chunk: VisionChunkVideo with type='video_chunk',
                    video_chunk=list[PIL.Image]
            text: Token ids to be fed to a model (required). A `str` is also
                accepted and is encoded with the tokenizer first.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- token ids as a tensor with a leading batch
              dimension of 1. When `vision_chunks` is given, every media
              token is repeated once per token of its chunk.
            - whatever `media_processor.preprocess(vision_chunks)` returns,
              when `vision_chunks` is not `None`.
        """
        if isinstance(text, str):
            input_ids = self.tokenizer.encode(text)
        else:
            input_ids = text

        mm_inputs = {}
        if vision_chunks is not None:
            assert isinstance(vision_chunks, list)
            mm_inputs = self.media_processor.preprocess(vision_chunks)
            num_tokens_per_chunk = [
                self.media_processor.media_tokens_calculator(chunk)
                for chunk in vision_chunks
            ]

            # The k-th media token in the prompt belongs to the k-th chunk.
            expanded_ids = []
            for token in input_ids:
                if token != self.media_token_id:
                    expanded_ids.append(token)
                    continue
                repeat = num_tokens_per_chunk.pop(0)
                expanded_ids.extend([self.media_token_id] * repeat)
            input_ids = expanded_ids

        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
        return BatchFeature(
            data={
                "input_ids": torch.tensor([input_ids]),
                **mm_inputs,
            }
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import product
from math import ceil
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.tokenizers import TokenizerLike
# Longest side, in pixels, allowed by `ImagePatcher.get_image_size_for_preprocess`;
# larger images are scaled down proportionally to fit.
MAX_IMAGE_SIZE: int = 3024
# (image, patches cropped from it, per-patch "row ends here" flags or None),
# i.e. the shape of the tuple returned by `ImagePatcher.__call__`.
ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
class Step3VisionProcessor:
    """Tensor conversion, normalization and square resize for Step3 images.

    Two pipelines are kept: `transform` resizes to `(size, size)` and
    `patch_transform` resizes to `(patch_size, patch_size)`.
    """

    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
        """Build both pipelines.

        Args:
            size: Edge length of the full-image output.
            interpolation_mode: `"bicubic"` selects bicubic resizing; any
                other value selects bilinear.
            patch_size: Edge length of the patch output; defaults to `size`.
        """
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]
        if patch_size is None:
            patch_size = size

        interpolation = (
            InterpolationMode.BICUBIC
            if interpolation_mode == "bicubic"
            else InterpolationMode.BILINEAR
        )

        def build_pipeline(edge):
            # ToTensor -> Normalize -> Resize, in that order.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std),
                    transforms.Resize(
                        (edge, edge),
                        interpolation=interpolation,
                        antialias=True,
                    ),
                ]
            )

        self.transform = build_pipeline(size)
        # Always built: `patch_size` has been defaulted to `size` above.
        self.patch_transform = build_pipeline(patch_size)

    def __call__(self, image, is_patch=False):
        """Run the selected pipeline and add a leading batch dimension.

        Returns:
            `dict` with a single key `"pixel_values"`.
        """
        pipeline = self.patch_transform if is_patch else self.transform
        return {"pixel_values": pipeline(image).unsqueeze(0)}
class ImagePatcher:
    """Split an image into a global view plus square sliding-window patches."""

    def __init__(self, enable_patch: bool = True) -> None:
        # When False, no patches are produced; only the global image is used.
        self.enable_patch = enable_patch

    def determine_window_size(self, long: int, short: int) -> int:
        """Return the patch window edge length; 0 means "do not patch".

        Args:
            long: Longer image side in pixels.
            short: Shorter image side in pixels.
        """
        if long < 728:
            return short if long / short > 1.5 else 0
        return min(short, 504) if long / short > 4 else 504

    def slide_window(
        self,
        width: int,
        height: int,
        sizes: list[tuple[int, int]],
        steps: list[tuple[int, int]],
        img_rate_thr: float = 0.6,
    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
        """Enumerate sliding windows over a `width` x `height` area.

        Args:
            width: Area width in pixels.
            height: Area height in pixels.
            sizes: `(window_w, window_h)` per window configuration.
            steps: `(step_w, step_h)` per window configuration.
            img_rate_thr: Must lie in [0, 1]; only validated, not otherwise
                used here.

        Returns:
            A list of `(x, y, w, h)` windows in row-major order (all windows
            of the first row, then the second, ...), and `(x_num, y_num)`,
            the column/row counts of the last configuration processed.
        """
        assert 1 >= img_rate_thr >= 0, "The `img_rate_thr` should lie in 0~1"
        windows = []
        # Sliding windows.
        for size, step in zip(sizes, steps):
            size_w, size_h = size
            step_w, step_h = step

            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
            x_start = [step_w * i for i in range(x_num)]
            # Pull the last column back so it ends at the right border.
            if len(x_start) > 1 and x_start[-1] + size_w > width:
                x_start[-1] = width - size_w

            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
            y_start = [step_h * i for i in range(y_num)]
            # Pull the last row back so it ends at the bottom border.
            if len(y_start) > 1 and y_start[-1] + size_h > height:
                y_start[-1] = height - size_h

            # product() yields (y, x) pairs row by row; swap columns to (x, y).
            start = np.array(list(product(y_start, x_start)), dtype=int)
            start[:, [0, 1]] = start[:, [1, 0]]
            # Each row becomes (x0, y0, x1, y1).
            windows.append(np.concatenate([start, start + size], axis=1))
        windows = np.concatenate(windows, axis=0)

        return [
            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
            for box in windows
        ], (x_num, y_num)

    def square_pad(self, img: Image.Image) -> Image.Image:
        """Pad `img` to a square, keeping the content at the top-left corner."""
        w, h = img.size
        if w == h:
            return img
        size = max(w, h)
        padded = Image.new(img.mode, (size, size), 0)
        padded.paste(img, (0, 0))
        return padded

    def get_image_size_for_padding(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Return the size after optional square padding.

        Images whose shorter side is under 32 px and whose aspect ratio is
        beyond 4:1 (either way) become squares of the longer side; all other
        sizes are returned unchanged.
        """
        ratio = img_width / img_height
        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
            new_size = max(img_height, img_width)
            return new_size, new_size
        return img_width, img_height

    def get_image_size_for_preprocess(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Scale the size down so the longer side is at most `MAX_IMAGE_SIZE`."""
        if max(img_height, img_width) > MAX_IMAGE_SIZE:
            scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
            img_width = int(img_width * scale_factor)
            img_height = int(img_height * scale_factor)
        return img_width, img_height

    def get_image_size_for_crop(
        self, img_width: int, img_height: int, window_size: int
    ):
        """Snap each side to a whole number of windows.

        A side shorter than one window is kept as is. Otherwise the number of
        windows is rounded up when the fractional part exceeds 0.2 and rounded
        down when it does not.
        """
        w_ratio = img_width / window_size
        h_ratio = img_height / window_size

        if w_ratio < 1:
            width_new = img_width
        else:
            decimal_w = w_ratio - img_width // window_size
            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
            width_new = window_size * w_ratio

        if h_ratio < 1:
            height_new = img_height
        else:
            decimal_h = h_ratio - img_height // window_size
            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
            height_new = window_size * h_ratio

        return int(width_new), int(height_new)

    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
        """Crop the `tw` x `th` region whose top-left corner is row `i`, column `j`."""
        target = img.crop((j, i, j + tw, i + th))
        return target

    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
        """Predict `(num_patches, num_newlines)` for an image of the given size.

        Applies the same size pipeline as `__call__` without touching pixels.
        Returns `(0, 0)` when patching is disabled or no window applies.
        """
        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
        img_width, img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        window_size = self.determine_window_size(
            max(img_height, img_width), min(img_height, img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return 0, 0
        else:
            img_width, img_height = self.get_image_size_for_crop(
                img_width, img_height, window_size
            )
            center_list, (x_num, y_num) = self.slide_window(
                img_width,
                img_height,
                [(window_size, window_size)],
                [(window_size, window_size)],
            )
            # Number of patch rows, minus one when the last row is complete:
            # no newline is emitted after the final patch.
            full_rows = (len(center_list) - 1) // x_num + 1
            if len(center_list) > 0 and len(center_list) % x_num == 0:
                full_rows -= 1
            return len(center_list), full_rows

    def __call__(
        self, img: Image.Image
    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
        """Produce the global image, its patches and the row-end flags.

        Returns:
            `(img, patches, newline_mask)`. `img` is the padded and
            size-limited image. `patches` is empty and `newline_mask` is
            `None` when no patching happens. Otherwise `newline_mask[i]` is
            True when patch `i` closes a row, except for the very last patch.
        """
        img_width, img_height = img.size
        new_img_width, new_img_height = self.get_image_size_for_padding(
            img_width, img_height
        )
        if new_img_width != img_width or new_img_height != img_height:
            img = self.square_pad(img)
            img_width, img_height = img.size

        new_img_width, new_img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
        window_size = self.determine_window_size(
            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return img, [], None
        else:
            new_img_width, new_img_height = self.get_image_size_for_crop(
                new_img_width, new_img_height, window_size
            )
            # NOTE(review): this compares against the size measured before the
            # MAX_IMAGE_SIZE resize above, not the current `img.size` --
            # confirm that is intended.
            if (new_img_width, new_img_height) != (img_width, img_height):
                img_for_crop = img.resize(
                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
                )
            else:
                img_for_crop = img

            patches = []
            newlines = []
            center_list, (x_num, y_num) = self.slide_window(
                new_img_width,
                new_img_height,
                [(window_size, window_size)],
                [(window_size, window_size)],
            )
            for patch_id, center_lf_point in enumerate(center_list):
                x, y, patch_w, patch_h = center_lf_point
                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
                patches.append(big_patch)
                # The last patch of every row is followed by a newline ...
                if (patch_id + 1) % x_num == 0:
                    newlines.append(patch_id)

            # ... except after the final patch overall.
            if newlines and newlines[-1] == len(patches) - 1:
                newlines.pop()

            return (
                img,
                patches,
                [i in newlines for i in range(len(patches))]
                if len(patches) > 0
                else None,
            )
class Step3VLProcessor:
    """Build Step3-VL model inputs: expanded prompt text plus pixel tensors."""

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        # Edge lengths of the global image view and of each patch.
        self.image_size = 728
        self.patch_size = 504
        self.image_preprocessor = Step3VisionProcessor(
            self.image_size, "bilinear", self.patch_size
        )

        # Number of `<im_patch>` tokens emitted per global image / per patch.
        self.num_image_feature_size = 169
        self.num_patch_feature_size = 81
        self.image_token = "<im_patch>"
        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size

        # Respect vision config switch to enable/disable patch extraction.
        # For video understanding, it's preferable to disable patch.
        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
        self.patcher = ImagePatcher(enable_patch=enable_patch)

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of `self.image_token` (looked up on every access)."""
        return self.tokenizer.get_vocab()[self.image_token]

    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
        """Return how many prompt tokens one image of this size expands to.

        Each patch contributes its feature tokens plus start/end markers, the
        global image contributes its feature tokens plus start/end markers,
        and every patch newline adds one token.
        """
        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)

        return (
            num_patches * (self.num_patch_feature_size + 2)
            + self.num_image_feature_size
            + 2
            + num_newlines
        )

    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
        """Run the patcher on every image."""
        return [self.patcher(img) for img in images]

    def _convert_images_to_pixel_values(
        self,
        images: list[Image.Image],
        is_patch: bool = False,
    ) -> list[torch.Tensor]:
        """Preprocess each image (or patch) into its `pixel_values` tensor."""
        return [
            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
            for img in images
        ]

    def _get_patch_repl(
        self,
        num_patches: int,
        patch_newline_mask: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Return the replacement text and token ids for `num_patches` patches.

        Every patch becomes `<patch_start>` + feature tokens + `<patch_end>`,
        followed by `<patch_newline>` where `patch_newline_mask` is True.
        With no patches the result is `("", [])`.
        """
        if num_patches <= 0:
            return "", []

        assert patch_newline_mask is not None and len(patch_newline_mask) == num_patches

        # Everything below is identical for every patch, so resolve it once
        # instead of once per patch (`image_token_id` rebuilds the vocab).
        to_id = self.tokenizer.convert_tokens_to_ids
        patch_text = f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
        patch_ids = (
            [to_id("<patch_start>")]
            + [self.image_token_id] * self.num_patch_feature_size
            + [to_id("<patch_end>")]
        )
        # Only look the newline token up when at least one newline is emitted.
        newline_id = to_id("<patch_newline>") if any(patch_newline_mask) else None

        text_parts: list[str] = []
        token_ids: list[int] = []
        for i in range(num_patches):
            text_parts.append(patch_text)
            token_ids.extend(patch_ids)
            if patch_newline_mask[i]:
                text_parts.append("<patch_newline>")
                token_ids.append(newline_id)
        return "".join(text_parts), token_ids

    def _get_image_repl(
        self,
        num_images: int,
    ) -> tuple[str, list[int]]:
        """Return the replacement text and token ids for `num_images` global views."""
        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
        token_ids = (
            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
            + [self.image_token_id] * self.num_image_feature_size
            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
        )
        return text * num_images, token_ids * num_images

    def _get_image_repl_features(
        self,
        num_images: int,
        num_patches: int,
        patch_new_line_idx: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Return the full replacement: patch section first, then image section."""
        if num_patches > 0:
            patch_repl, patch_repl_ids = self._get_patch_repl(
                num_patches, patch_new_line_idx
            )
        else:
            patch_repl = ""
            patch_repl_ids = []
        image_repl, image_repl_ids = self._get_image_repl(num_images)
        return patch_repl + image_repl, patch_repl_ids + image_repl_ids

    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
        """Replace the n-th occurrence of `placeholder` with `repls[n]`.

        Raises:
            ValueError: If the number of occurrences differs from `len(repls)`.
        """
        parts = text.split(placeholder)

        if len(parts) - 1 != len(repls):
            raise ValueError(
                "The number of placeholders does not match the number of replacements."
            )

        result = [parts[0]]
        for i, repl in enumerate(repls):
            result.append(repl)
            result.append(parts[i + 1])

        return "".join(result)

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize `text` and preprocess `images`.

        Args:
            text: Prompt or list of prompts. With images, each prompt must
                contain exactly one `<im_patch>` per image; every occurrence
                is replaced by that image's expanded token sequence.
            images: One image or a list of images.
            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.

        Returns:
            `BatchFeature` with the tokenizer outputs and, when images are
            given, `pixel_values`, `num_patches`, `patch_pixel_values` and
            `patch_newline_mask`.
        """
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        if len(images) == 0:
            image_inputs = {}
            text_inputs = self.tokenizer(text)
        else:
            split_images_data = self._split_images(images)
            pixel_values_lst = []
            patch_pixel_values_lst = []
            patch_newline_mask_lst = []
            image_repl_str_lst = []
            num_patches = []
            for raw_img, img_patches, patch_newline_mask in split_images_data:
                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))

                if len(img_patches) > 0:
                    patch_pixel_values_lst.extend(
                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
                    )
                num_patches.append(len(img_patches))

                # Only the text form is needed here; the tokenizer below
                # produces the ids from the expanded prompt.
                image_repl_str, _ = self._get_image_repl_features(
                    1, len(img_patches), patch_newline_mask
                )
                image_repl_str_lst.append(image_repl_str)

                if patch_newline_mask is not None:
                    patch_newline_mask_lst.extend(patch_newline_mask)

            pixel_values = torch.cat(pixel_values_lst)
            patch_size = self.patch_size
            image_inputs = {
                "pixel_values": pixel_values,
                "num_patches": num_patches,
                "patch_pixel_values": (
                    torch.cat(patch_pixel_values_lst)
                    if patch_pixel_values_lst
                    # No patches at all: keep an empty tensor of matching
                    # dtype/device with the expected trailing shape.
                    else pixel_values.new_empty((0, 3, patch_size, patch_size))
                ),
                "patch_newline_mask": torch.tensor(
                    patch_newline_mask_lst, dtype=torch.bool
                ),
            }

            text = [
                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
                for t in text
            ]
            text_inputs = self.tokenizer(text)

        return BatchFeature(
            {
                **text_inputs,
                **image_inputs,
            },
            tensor_type=return_tensors,
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment