Unverified commit 37aadf62, authored by Cyrus Leung, committed by GitHub
Browse files

[Model] Update Kimi-K25 and Isaac processors to fit HF-style (#37693)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent d7d2b5e4
......@@ -334,15 +334,14 @@ class IsaacProcessingInfo(BaseProcessingInfo):
return IsaacConfig()
def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
return IsaacImageProcessor(kwargs)
return IsaacImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs) -> IsaacProcessor:
hf_config = self.get_hf_config()
return self.ctx.init_processor(
IsaacProcessor,
return IsaacProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(),
image_processor=self.get_image_processor(**kwargs),
image_token=hf_config.vision_token,
)
......
......@@ -104,19 +104,25 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
def __init__(self, ctx: InputProcessingContext) -> None:
super().__init__(ctx)
self.hf_config = self.get_hf_config()
self.media_token_id = self.hf_config.media_placeholder_token_id
media_processor = cached_get_image_processor(
self.hf_config = hf_config = self.get_hf_config()
tokenizer = self.get_tokenizer()
image_processor = cached_get_image_processor(
self.ctx.model_config.model,
trust_remote_code=self.ctx.model_config.trust_remote_code,
)
self.media_processor = media_processor
self.media_token_id = media_token_id = hf_config.media_placeholder_token_id
self.media_token = tokenizer.decode(media_token_id)
self.image_processor = image_processor
self.hf_processor = KimiK25Processor(
media_processor=self.media_processor,
tokenizer=self.get_tokenizer(),
media_token_id=self.media_token_id,
tokenizer=tokenizer,
image_processor=image_processor,
media_token_id=media_token_id,
)
self.media_tokens_calculator = self.media_processor.media_tokens_calculator
self.media_tokens_calculator = image_processor.media_tokens_calculator
def get_hf_processor(self):
return self.hf_processor
......@@ -132,20 +138,15 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
"""Builds dummy inputs for Kimi-K2.5 model profiling."""
def __init__(self, info: KimiK25ProcessingInfo) -> None:
super().__init__(info)
self.media_token_id = self.info.media_token_id
self.frame_per_chunk = self.info.media_processor.num_frames_per_chunk
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_media = mm_counts.get("vision_chunk", 0)
return "<|media_pad|>" * num_media
return self.info.media_token * num_media
def get_dummy_mm_items(self):
dummy_videos = self._get_dummy_images(
height=MaxImageTokenMeta.height,
width=MaxImageTokenMeta.width,
num_images=self.frame_per_chunk,
num_images=self.info.image_processor.num_frames_per_chunk,
)
video_chunk_dummy_item = VisionChunkVideo(
......@@ -236,9 +237,6 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
),
]
def split_video_chunks(self, video):
return self.info.media_processor.split_video_chunks(video)
@MULTIMODAL_REGISTRY.register_processor(
KimiK25MultiModalProcessor,
......
......@@ -6,12 +6,14 @@ import math
from typing import Any
import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import BatchFeature, ProcessorMixin, TensorType
from typing_extensions import TypedDict, Unpack
from vllm.tokenizers.hf import HfTokenizer
MAX_PIXELS = 60_000_000 # 60-megapixel ceiling ≈ 8200 × 7300 px
# Vision preprocessing constants
......@@ -39,7 +41,7 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray:
return arr.copy()
def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
def extract_image_pil(image: Image.Image) -> torch.Tensor:
if image.width * image.height > MAX_PIXELS:
raise ValueError(
f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
......@@ -314,31 +316,30 @@ class IsaacImageProcessorKwargs(TypedDict, total=False):
class IsaacImageProcessor:
patch_size = 16
max_num_patches = 6144
min_num_patches = 256
pixel_shuffle_scale = 2
valid_kwargs = IsaacImageProcessorKwargs
model_input_names = ["pixel_values", "image_grid_thw"]
def __init__(self, kwargs):
self.patch_size = kwargs.pop("patch_size", self.patch_size)
self.vision_max_num_patches = kwargs.pop(
"vision_max_num_patches", self.max_num_patches
)
self.vision_min_num_patches = kwargs.pop(
"vision_min_num_patches", self.min_num_patches
)
self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2)
def preprocess(
def __init__(
self,
images: list[torch.Tensor],
return_tensors: str | TensorType | None,
patch_size: int = 16,
vision_max_num_patches: int = 6144,
vision_min_num_patches: int = 256,
pixel_shuffle_scale: int = 2,
) -> None:
self.patch_size = patch_size
self.vision_max_num_patches = vision_max_num_patches
self.vision_min_num_patches = vision_min_num_patches
self.pixel_shuffle_scale = pixel_shuffle_scale
def __call__(
self,
images: Image.Image | list[Image.Image],
return_tensors: str | TensorType | None = None,
**kwargs: Unpack[IsaacImageProcessorKwargs],
) -> BatchFeature:
"""Preprocess images into format compatible with vLLM input processing."""
if not isinstance(images, list):
images = [images]
all_pixel_values: list[torch.Tensor] = []
all_image_grids: list[torch.Tensor] = []
......@@ -388,23 +389,40 @@ class IsaacImageProcessor:
class IsaacProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
self.image_token = kwargs.pop("image_token", "<image>")
def __init__(
self,
image_processor: IsaacImageProcessor,
tokenizer: HfTokenizer,
image_token: str = "<image>",
):
self.image_processor = image_processor
self.tokenizer = tokenizer
def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
result = {}
self.image_token = image_token
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
if images is not None:
image_inputs = self.image_processor.preprocess(images, **kwargs)
image_inputs = self.image_processor(
images,
return_tensors=return_tensors,
**kwargs,
)
image_grid_thw = image_inputs["image_grid_thw"]
result.update(image_inputs)
else:
image_inputs = {}
image_grid_thw = []
if text is not None:
if not isinstance(text, list):
text = [text]
if text is not None:
if not isinstance(text, list):
text = [text]
if image_inputs:
text = text.copy() # below lines change text in-place
merge_length = self.image_processor.pixel_shuffle_scale**2
index = 0
......@@ -417,10 +435,14 @@ class IsaacProcessor(ProcessorMixin):
index += 1
text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
if text is not None:
result.update(self.tokenizer(text, **kwargs))
text_inputs = self.tokenizer(text, return_tensors=return_tensors)
else:
text_inputs = {}
return BatchFeature(result)
return BatchFeature(
data={**text_inputs, **image_inputs},
tensor_type=return_tensors,
)
def apply_chat_template(
self,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from transformers import BatchFeature
from transformers import BaseImageProcessor, BatchFeature, TensorType
from transformers.processing_utils import ProcessorMixin
from vllm.multimodal.inputs import VisionChunk
from vllm.tokenizers.hf import HfTokenizer
class KimiK25Processor(ProcessorMixin):
attributes = ["tokenizer"]
tokenizer_class = "AutoTokenizer"
attributes = ["image_processor", "tokenizer"]
def __init__(
self, media_processor=None, tokenizer=None, media_token_id: int | None = None
):
super().__init__(tokenizer)
self.media_processor = media_processor
self,
image_processor: BaseImageProcessor,
tokenizer: HfTokenizer,
media_token_id: int,
) -> None:
self.image_processor = image_processor
self.tokenizer = tokenizer
self.media_token_id = media_token_id
assert self.media_token_id is not None
def __call__(
self,
text: str | list[str] | None = None,
vision_chunks: list[VisionChunk] | None = None,
*,
text: list[int] | str,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
"""
Args:
vision_chunks: List of VisionChunk items to be processed.
For image: VisionChunkImage with type='image', image=PIL.Image
For video_chunk: VisionChunkVideo with type='video_chunk',
video_chunk=list[PIL.Image]
text: The token ids to be fed to a model (required).
text: The text to be fed to the model.
vision_chunks: List of `VisionChunk` items to be processed.
For image: `VisionChunkImage` with
`type='image', image=PIL.Image`
For video_chunk: `VisionChunkVideo` with
`type='video_chunk', video_chunk=list[PIL.Image]`
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
......@@ -42,31 +45,44 @@ class KimiK25Processor(ProcessorMixin):
- **grid_thws** -- list of image 3D grid in LLM.
Returned when `vision_chunks` is not `None`.
"""
mm_inputs = {}
input_ids = self.tokenizer.encode(text) if isinstance(text, str) else text
if vision_chunks is not None:
assert isinstance(vision_chunks, list)
mm_inputs = self.media_processor.preprocess(vision_chunks)
mm_inputs = self.image_processor.preprocess(
vision_chunks,
return_tensors=return_tensors,
)
else:
mm_inputs = {}
if text is not None:
if not isinstance(text, list):
text = [text]
text_inputs = self.tokenizer(text)
# Note: Modify in-place
input_ids: list[list[int]] = text_inputs["input_ids"] # type: ignore
if vision_chunks is not None:
num_tokens_per_chunk = [
self.image_processor.media_tokens_calculator(chunk)
for chunk in vision_chunks
]
num_tokens_per_chunk = [
self.media_processor.media_tokens_calculator(chunk)
for chunk in vision_chunks
]
for i in range(len(input_ids)):
new_input_ids = []
for token in input_ids[i]:
if token == self.media_token_id:
new_input_ids.extend(
[self.media_token_id] * num_tokens_per_chunk.pop(0)
)
else:
new_input_ids.append(token)
new_input_ids = []
for token in input_ids:
if token == self.media_token_id:
new_input_ids.extend(
[self.media_token_id] * num_tokens_per_chunk.pop(0)
)
else:
new_input_ids.append(token)
input_ids = new_input_ids
input_ids[i] = new_input_ids
else:
text_inputs = {}
# XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
return BatchFeature(
data={
"input_ids": torch.tensor([input_ids]),
**mm_inputs,
}
data={**text_inputs, **mm_inputs},
tensor_type=return_tensors,
)
......@@ -286,11 +286,9 @@ class Step3VLImageProcessor:
def __call__(
self,
images: Image.Image | list[Image.Image] | None = None,
images: Image.Image | list[Image.Image],
return_tensors: str | TensorType | None = None,
) -> BatchFeature:
if images is None:
images = []
if not isinstance(images, list):
images = [images]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment