Unverified commit 37aadf62, authored by Cyrus Leung, committed by GitHub
Browse files

[Model] Update Kimi-K25 and Isaac processors to fit HF-style (#37693)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent d7d2b5e4
...@@ -334,15 +334,14 @@ class IsaacProcessingInfo(BaseProcessingInfo): ...@@ -334,15 +334,14 @@ class IsaacProcessingInfo(BaseProcessingInfo):
return IsaacConfig() return IsaacConfig()
def get_image_processor(self, **kwargs) -> IsaacImageProcessor: def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
return IsaacImageProcessor(kwargs) return IsaacImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs) -> IsaacProcessor: def get_hf_processor(self, **kwargs) -> IsaacProcessor:
hf_config = self.get_hf_config() hf_config = self.get_hf_config()
return self.ctx.init_processor( return IsaacProcessor(
IsaacProcessor,
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(), image_processor=self.get_image_processor(**kwargs),
image_token=hf_config.vision_token, image_token=hf_config.vision_token,
) )
......
...@@ -104,19 +104,25 @@ class KimiK25ProcessingInfo(BaseProcessingInfo): ...@@ -104,19 +104,25 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
def __init__(self, ctx: InputProcessingContext) -> None: def __init__(self, ctx: InputProcessingContext) -> None:
super().__init__(ctx) super().__init__(ctx)
self.hf_config = self.get_hf_config()
self.media_token_id = self.hf_config.media_placeholder_token_id self.hf_config = hf_config = self.get_hf_config()
media_processor = cached_get_image_processor(
tokenizer = self.get_tokenizer()
image_processor = cached_get_image_processor(
self.ctx.model_config.model, self.ctx.model_config.model,
trust_remote_code=self.ctx.model_config.trust_remote_code, trust_remote_code=self.ctx.model_config.trust_remote_code,
) )
self.media_processor = media_processor
self.media_token_id = media_token_id = hf_config.media_placeholder_token_id
self.media_token = tokenizer.decode(media_token_id)
self.image_processor = image_processor
self.hf_processor = KimiK25Processor( self.hf_processor = KimiK25Processor(
media_processor=self.media_processor, tokenizer=tokenizer,
tokenizer=self.get_tokenizer(), image_processor=image_processor,
media_token_id=self.media_token_id, media_token_id=media_token_id,
) )
self.media_tokens_calculator = self.media_processor.media_tokens_calculator self.media_tokens_calculator = image_processor.media_tokens_calculator
def get_hf_processor(self): def get_hf_processor(self):
return self.hf_processor return self.hf_processor
...@@ -132,20 +138,15 @@ class KimiK25ProcessingInfo(BaseProcessingInfo): ...@@ -132,20 +138,15 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]): class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
"""Builds dummy inputs for Kimi-K2.5 model profiling.""" """Builds dummy inputs for Kimi-K2.5 model profiling."""
def __init__(self, info: KimiK25ProcessingInfo) -> None:
super().__init__(info)
self.media_token_id = self.info.media_token_id
self.frame_per_chunk = self.info.media_processor.num_frames_per_chunk
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_media = mm_counts.get("vision_chunk", 0) num_media = mm_counts.get("vision_chunk", 0)
return "<|media_pad|>" * num_media return self.info.media_token * num_media
def get_dummy_mm_items(self): def get_dummy_mm_items(self):
dummy_videos = self._get_dummy_images( dummy_videos = self._get_dummy_images(
height=MaxImageTokenMeta.height, height=MaxImageTokenMeta.height,
width=MaxImageTokenMeta.width, width=MaxImageTokenMeta.width,
num_images=self.frame_per_chunk, num_images=self.info.image_processor.num_frames_per_chunk,
) )
video_chunk_dummy_item = VisionChunkVideo( video_chunk_dummy_item = VisionChunkVideo(
...@@ -236,9 +237,6 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo]) ...@@ -236,9 +237,6 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
), ),
] ]
def split_video_chunks(self, video):
return self.info.media_processor.split_video_chunks(video)
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
KimiK25MultiModalProcessor, KimiK25MultiModalProcessor,
......
...@@ -6,12 +6,14 @@ import math ...@@ -6,12 +6,14 @@ import math
from typing import Any from typing import Any
import numpy as np import numpy as np
import PIL.Image
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from PIL import Image
from transformers import BatchFeature, ProcessorMixin, TensorType from transformers import BatchFeature, ProcessorMixin, TensorType
from typing_extensions import TypedDict, Unpack from typing_extensions import TypedDict, Unpack
from vllm.tokenizers.hf import HfTokenizer
MAX_PIXELS = 60_000_000 # 60-megapixel ceiling ≈ 8200 × 7300 px MAX_PIXELS = 60_000_000 # 60-megapixel ceiling ≈ 8200 × 7300 px
# Vision preprocessing constants # Vision preprocessing constants
...@@ -39,7 +41,7 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray: ...@@ -39,7 +41,7 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray:
return arr.copy() return arr.copy()
def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None: def extract_image_pil(image: Image.Image) -> torch.Tensor:
if image.width * image.height > MAX_PIXELS: if image.width * image.height > MAX_PIXELS:
raise ValueError( raise ValueError(
f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`" f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
...@@ -314,31 +316,30 @@ class IsaacImageProcessorKwargs(TypedDict, total=False): ...@@ -314,31 +316,30 @@ class IsaacImageProcessorKwargs(TypedDict, total=False):
class IsaacImageProcessor: class IsaacImageProcessor:
patch_size = 16
max_num_patches = 6144
min_num_patches = 256
pixel_shuffle_scale = 2
valid_kwargs = IsaacImageProcessorKwargs valid_kwargs = IsaacImageProcessorKwargs
model_input_names = ["pixel_values", "image_grid_thw"] model_input_names = ["pixel_values", "image_grid_thw"]
def __init__(self, kwargs): def __init__(
self.patch_size = kwargs.pop("patch_size", self.patch_size)
self.vision_max_num_patches = kwargs.pop(
"vision_max_num_patches", self.max_num_patches
)
self.vision_min_num_patches = kwargs.pop(
"vision_min_num_patches", self.min_num_patches
)
self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2)
def preprocess(
self, self,
images: list[torch.Tensor], patch_size: int = 16,
return_tensors: str | TensorType | None, vision_max_num_patches: int = 6144,
vision_min_num_patches: int = 256,
pixel_shuffle_scale: int = 2,
) -> None:
self.patch_size = patch_size
self.vision_max_num_patches = vision_max_num_patches
self.vision_min_num_patches = vision_min_num_patches
self.pixel_shuffle_scale = pixel_shuffle_scale
def __call__(
self,
images: Image.Image | list[Image.Image],
return_tensors: str | TensorType | None = None,
**kwargs: Unpack[IsaacImageProcessorKwargs], **kwargs: Unpack[IsaacImageProcessorKwargs],
) -> BatchFeature: ) -> BatchFeature:
"""Preprocess images into format compatible with vLLM input processing.""" """Preprocess images into format compatible with vLLM input processing."""
if not isinstance(images, list):
images = [images]
all_pixel_values: list[torch.Tensor] = [] all_pixel_values: list[torch.Tensor] = []
all_image_grids: list[torch.Tensor] = [] all_image_grids: list[torch.Tensor] = []
...@@ -388,23 +389,40 @@ class IsaacImageProcessor: ...@@ -388,23 +389,40 @@ class IsaacImageProcessor:
class IsaacProcessor(ProcessorMixin): class IsaacProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"] attributes = ["image_processor", "tokenizer"]
def __init__(self, image_processor=None, tokenizer=None, **kwargs): def __init__(
self.image_token = kwargs.pop("image_token", "<image>") self,
image_processor: IsaacImageProcessor,
tokenizer: HfTokenizer,
image_token: str = "<image>",
):
self.image_processor = image_processor self.image_processor = image_processor
self.tokenizer = tokenizer self.tokenizer = tokenizer
def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: self.image_token = image_token
result = {}
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
if images is not None: if images is not None:
image_inputs = self.image_processor.preprocess(images, **kwargs) image_inputs = self.image_processor(
images,
return_tensors=return_tensors,
**kwargs,
)
image_grid_thw = image_inputs["image_grid_thw"] image_grid_thw = image_inputs["image_grid_thw"]
result.update(image_inputs) else:
image_inputs = {}
image_grid_thw = []
if text is not None: if text is not None:
if not isinstance(text, list): if not isinstance(text, list):
text = [text] text = [text]
if image_inputs:
text = text.copy() # below lines change text in-place text = text.copy() # below lines change text in-place
merge_length = self.image_processor.pixel_shuffle_scale**2 merge_length = self.image_processor.pixel_shuffle_scale**2
index = 0 index = 0
...@@ -417,10 +435,14 @@ class IsaacProcessor(ProcessorMixin): ...@@ -417,10 +435,14 @@ class IsaacProcessor(ProcessorMixin):
index += 1 index += 1
text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>") text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
if text is not None: text_inputs = self.tokenizer(text, return_tensors=return_tensors)
result.update(self.tokenizer(text, **kwargs)) else:
text_inputs = {}
return BatchFeature(result) return BatchFeature(
data={**text_inputs, **image_inputs},
tensor_type=return_tensors,
)
def apply_chat_template( def apply_chat_template(
self, self,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch from transformers import BaseImageProcessor, BatchFeature, TensorType
from transformers import BatchFeature
from transformers.processing_utils import ProcessorMixin from transformers.processing_utils import ProcessorMixin
from vllm.multimodal.inputs import VisionChunk from vllm.multimodal.inputs import VisionChunk
from vllm.tokenizers.hf import HfTokenizer
class KimiK25Processor(ProcessorMixin): class KimiK25Processor(ProcessorMixin):
attributes = ["tokenizer"] attributes = ["image_processor", "tokenizer"]
tokenizer_class = "AutoTokenizer"
def __init__( def __init__(
self, media_processor=None, tokenizer=None, media_token_id: int | None = None self,
): image_processor: BaseImageProcessor,
super().__init__(tokenizer) tokenizer: HfTokenizer,
self.media_processor = media_processor media_token_id: int,
) -> None:
self.image_processor = image_processor
self.tokenizer = tokenizer
self.media_token_id = media_token_id self.media_token_id = media_token_id
assert self.media_token_id is not None
def __call__( def __call__(
self, self,
text: str | list[str] | None = None,
vision_chunks: list[VisionChunk] | None = None, vision_chunks: list[VisionChunk] | None = None,
*, return_tensors: str | TensorType | None = None,
text: list[int] | str,
**kwargs, **kwargs,
) -> BatchFeature: ) -> BatchFeature:
""" """
Args: Args:
vision_chunks: List of VisionChunk items to be processed. text: The text to be fed to the model.
For image: VisionChunkImage with type='image', image=PIL.Image vision_chunks: List of `VisionChunk` items to be processed.
For video_chunk: VisionChunkVideo with type='video_chunk', For image: `VisionChunkImage` with
video_chunk=list[PIL.Image] `type='image', image=PIL.Image`
text: The token ids to be fed to a model (required). For video_chunk: `VisionChunkVideo` with
`type='video_chunk', video_chunk=list[PIL.Image]`
Returns: Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields: [`BatchFeature`]: A [`BatchFeature`] with the following fields:
...@@ -42,31 +45,44 @@ class KimiK25Processor(ProcessorMixin): ...@@ -42,31 +45,44 @@ class KimiK25Processor(ProcessorMixin):
- **grid_thws** -- list of image 3D grid in LLM. - **grid_thws** -- list of image 3D grid in LLM.
Returned when `vision_chunks` is not `None`. Returned when `vision_chunks` is not `None`.
""" """
mm_inputs = {}
input_ids = self.tokenizer.encode(text) if isinstance(text, str) else text
if vision_chunks is not None: if vision_chunks is not None:
assert isinstance(vision_chunks, list) mm_inputs = self.image_processor.preprocess(
mm_inputs = self.media_processor.preprocess(vision_chunks) vision_chunks,
return_tensors=return_tensors,
)
else:
mm_inputs = {}
if text is not None:
if not isinstance(text, list):
text = [text]
text_inputs = self.tokenizer(text)
# Note: Modify in-place
input_ids: list[list[int]] = text_inputs["input_ids"] # type: ignore
if vision_chunks is not None:
num_tokens_per_chunk = [
self.image_processor.media_tokens_calculator(chunk)
for chunk in vision_chunks
]
num_tokens_per_chunk = [ for i in range(len(input_ids)):
self.media_processor.media_tokens_calculator(chunk) new_input_ids = []
for chunk in vision_chunks for token in input_ids[i]:
] if token == self.media_token_id:
new_input_ids.extend(
[self.media_token_id] * num_tokens_per_chunk.pop(0)
)
else:
new_input_ids.append(token)
new_input_ids = [] input_ids[i] = new_input_ids
for token in input_ids: else:
if token == self.media_token_id: text_inputs = {}
new_input_ids.extend(
[self.media_token_id] * num_tokens_per_chunk.pop(0)
)
else:
new_input_ids.append(token)
input_ids = new_input_ids
# XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
return BatchFeature( return BatchFeature(
data={ data={**text_inputs, **mm_inputs},
"input_ids": torch.tensor([input_ids]), tensor_type=return_tensors,
**mm_inputs,
}
) )
...@@ -286,11 +286,9 @@ class Step3VLImageProcessor: ...@@ -286,11 +286,9 @@ class Step3VLImageProcessor:
def __call__( def __call__(
self, self,
images: Image.Image | list[Image.Image] | None = None, images: Image.Image | list[Image.Image],
return_tensors: str | TensorType | None = None, return_tensors: str | TensorType | None = None,
) -> BatchFeature: ) -> BatchFeature:
if images is None:
images = []
if not isinstance(images, list): if not isinstance(images, list):
images = [images] images = [images]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment