Commit 5042815a authored by Roger Wang's avatar Roger Wang Committed by khluu
Browse files
parent afb390ab
...@@ -686,6 +686,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen ...@@ -686,6 +686,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | | `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
| `KimiK25ForConditionalGeneration` | Kimi-K2.5 | T + I<sup>+</sup> | `moonshotai/Kimi-K2.5` | | ✅︎ |
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ | | `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
| `Lfm2VlForConditionalGeneration` | LFM2-VL | T + I<sup>+</sup> | `LiquidAI/LFM2-VL-450M`, `LiquidAI/LFM2-VL-3B`, `LiquidAI/LFM2-VL-8B-A1B`, etc. | ✅︎ | ✅︎ | | `Lfm2VlForConditionalGeneration` | LFM2-VL | T + I<sup>+</sup> | `LiquidAI/LFM2-VL-450M`, `LiquidAI/LFM2-VL-3B`, `LiquidAI/LFM2-VL-8B-A1B`, etc. | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
......
...@@ -771,6 +771,11 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -771,6 +771,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
) )
}, },
), ),
"KimiK25ForConditionalGeneration": _HfExamplesInfo(
"moonshotai/Kimi-K2.5",
trust_remote_code=True,
is_available_online=False,
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo( "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B-1025" "lightonai/LightOnOCR-1B-1025"
), ),
......
...@@ -46,6 +46,9 @@ from vllm.multimodal.inputs import ( ...@@ -46,6 +46,9 @@ from vllm.multimodal.inputs import (
MultiModalBatchedField, MultiModalBatchedField,
MultiModalFlatField, MultiModalFlatField,
MultiModalSharedField, MultiModalSharedField,
VisionChunk,
VisionChunkImage,
VisionChunkVideo,
) )
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
...@@ -336,7 +339,9 @@ ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] ...@@ -336,7 +339,9 @@ ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
ChatTemplateContentFormat = Literal["string", "openai"] ChatTemplateContentFormat = Literal["string", "openai"]
ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds"] ModalityStr = Literal[
"image", "audio", "video", "image_embeds", "audio_embeds", "vision_chunk"
]
_T = TypeVar("_T") _T = TypeVar("_T")
...@@ -449,6 +454,78 @@ def _get_embeds_data( ...@@ -449,6 +454,78 @@ def _get_embeds_data(
raise NotImplementedError(type(data_items)) raise NotImplementedError(type(data_items))
def rebuild_mm_uuids_from_mm_data(
mm_uuids: MultiModalUUIDDict,
mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict:
"""Rebuild mm_uuids after vision_chunk processing.
When videos are split into chunks, the original UUIDs need to be updated
to reflect the new UUIDs generated for each chunk.
Args:
mm_uuids: Original UUIDs dictionary
mm_data: Processed multimodal data with vision_chunk items
Returns:
Updated UUIDs dictionary with chunk UUIDs
"""
vision_chunks = mm_data.get("vision_chunk")
if vision_chunks is None:
return mm_uuids
new_uuids = dict(mm_uuids)
vision_chunk_uuids = []
for item in vision_chunks:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert isinstance(item, dict)
uuid_val = item.get("uuid")
if uuid_val is not None:
vision_chunk_uuids.append(uuid_val)
if vision_chunk_uuids:
new_uuids["vision_chunk"] = vision_chunk_uuids
return new_uuids
def build_video_prompts_from_mm_data(
mm_data: MultiModalDataDict,
) -> list[str]:
"""Build video prompts from vision_chunk data.
Collects prompts from video chunks and groups them by video_idx.
Args:
mm_data: Processed multimodal data with vision_chunk items
Returns:
List of video prompts, one per video.
"""
vision_chunks = mm_data.get("vision_chunk")
if vision_chunks is None:
return []
# Group chunks by video_idx
video_prompts_dict: dict[int, list[str]] = defaultdict(list)
for item in vision_chunks:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert isinstance(item, dict)
if item.get("type") == "video_chunk":
video_idx = item.get("video_idx", 0)
prompt = item.get("prompt", "")
video_prompts_dict[video_idx].append(prompt)
# Build prompts in video order
video_prompts = []
for video_idx in sorted(video_prompts_dict.keys()):
video_prompts.append("".join(video_prompts_dict[video_idx]))
return video_prompts
class BaseMultiModalItemTracker(ABC, Generic[_T]): class BaseMultiModalItemTracker(ABC, Generic[_T]):
""" """
Tracks multi-modal items in a given request and ensures that the number Tracks multi-modal items in a given request and ensures that the number
...@@ -462,6 +539,13 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -462,6 +539,13 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
self._model_config = model_config self._model_config = model_config
self._items_by_modality = defaultdict[str, list[_T]](list) self._items_by_modality = defaultdict[str, list[_T]](list)
# Track original modality for each vision_chunk item (image or video)
self._modality_order = defaultdict[str, list[str]](list)
@cached_property
def use_unified_vision_chunk_modality(self) -> bool:
"""Check if model uses unified vision_chunk modality for images/videos."""
return getattr(self._model_config.hf_config, "use_unified_vision_chunk", False)
@property @property
def model_config(self) -> ModelConfig: def model_config(self) -> ModelConfig:
...@@ -499,11 +583,31 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -499,11 +583,31 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
media. media.
""" """
input_modality = modality.replace("_embeds", "") input_modality = modality.replace("_embeds", "")
num_items = len(self._items_by_modality[modality]) + 1 original_modality = modality
use_vision_chunk = (
self.use_unified_vision_chunk_modality
and original_modality in ["video", "image"]
)
# If use_unified_vision_chunk_modality is enabled,
# map image/video to vision_chunk
if use_vision_chunk:
# To avoid validation fail
# because models with use_unified_vision_chunk_modality=True
# will only accept vision_chunk modality.
input_modality = "vision_chunk"
num_items = len(self._items_by_modality[input_modality]) + 1
else:
num_items = len(self._items_by_modality[original_modality]) + 1
self.mm_processor.validate_num_items(input_modality, num_items) self.mm_processor.validate_num_items(input_modality, num_items)
self._items_by_modality[modality].append(item) # Track original modality for vision_chunk items
if use_vision_chunk:
self._items_by_modality[input_modality].append(item) # type: ignore
self._modality_order["vision_chunk"].append(original_modality)
else:
self._items_by_modality[original_modality].append(item)
return self.model_cls.get_placeholder_str(modality, num_items) return self.model_cls.get_placeholder_str(modality, num_items)
...@@ -515,6 +619,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -515,6 +619,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def _resolve_items( def _resolve_items(
items_by_modality: dict[str, list[tuple[object, str | None]]], items_by_modality: dict[str, list[tuple[object, str | None]]],
mm_processor: BaseMultiModalProcessor, mm_processor: BaseMultiModalProcessor,
vision_chunk_modality_order: dict[str, list[str]],
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]: ) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
if "image" in items_by_modality and "image_embeds" in items_by_modality: if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed") raise ValueError("Mixing raw image and embedding inputs is not allowed")
...@@ -546,6 +651,74 @@ def _resolve_items( ...@@ -546,6 +651,74 @@ def _resolve_items(
if "video" in items_by_modality: if "video" in items_by_modality:
mm_data["video"] = [data for data, uuid in items_by_modality["video"]] mm_data["video"] = [data for data, uuid in items_by_modality["video"]]
mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]] mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]]
if "vision_chunk" in items_by_modality:
# Process vision_chunk items - extract from (data, modality) tuples
# and convert to VisionChunk types with proper UUID handling
vision_chunk_items = items_by_modality["vision_chunk"]
modality_order = vision_chunk_modality_order.get("vision_chunk", [])
mm_uuids["vision_chunk"] = [
uuid for data, uuid in items_by_modality["vision_chunk"]
]
# Filter out None items (from asyncio.sleep(0) placeholders)
filtered_items = [
(idx, item)
for idx, item in enumerate(vision_chunk_items)
if item is not None
]
assert len(filtered_items) == len(modality_order), (
f"vision_chunk items ({len(filtered_items)}) and "
f"modality_order ({len(modality_order)}) must have same length"
)
processed_chunks: list[VisionChunk] = []
video_idx = 0
for i, (idx, item) in enumerate(filtered_items):
inner_modality = modality_order[i]
data, uuid = item
uuid_val = uuid if idx < len(mm_uuids["vision_chunk"]) else None
if inner_modality == "image":
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if hasattr(data, "media"):
image_data = data.media # type: ignore[union-attr]
processed_chunks.append(
VisionChunkImage(type="image", image=image_data, uuid=uuid_val)
)
else:
processed_chunks.append(data) # type: ignore[arg-type]
elif inner_modality == "video":
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if hasattr(mm_processor, "split_video_chunks") and data is not None:
try:
video_uuid = uuid_val or random_uuid()
# video await result is (video_data, video_meta) tuple
if isinstance(data, tuple) and len(data) >= 1:
video_data = data[0]
else:
video_data = data
video_chunks = mm_processor.split_video_chunks(video_data)
for i, vc in enumerate(video_chunks):
processed_chunks.append(
VisionChunkVideo(
type="video_chunk",
video_chunk=vc["video_chunk"],
uuid=f"{video_uuid}-{i}",
video_idx=video_idx,
prompt=vc["prompt"],
)
)
video_idx += 1
except Exception as e:
logger.warning("Failed to split video chunks: %s", e)
processed_chunks.append(data) # type: ignore[arg-type]
else:
processed_chunks.append(data) # type: ignore[arg-type]
mm_data["vision_chunk"] = processed_chunks
return mm_data, mm_uuids return mm_data, mm_uuids
...@@ -557,7 +730,9 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[tuple[object, str | None]] ...@@ -557,7 +730,9 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[tuple[object, str | None]]
if not self._items_by_modality: if not self._items_by_modality:
return None, None return None, None
return _resolve_items(dict(self._items_by_modality), self.mm_processor) return _resolve_items(
dict(self._items_by_modality), self.mm_processor, self._modality_order
)
def create_parser(self) -> "BaseMultiModalContentParser": def create_parser(self) -> "BaseMultiModalContentParser":
return MultiModalContentParser(self) return MultiModalContentParser(self)
...@@ -577,7 +752,9 @@ class AsyncMultiModalItemTracker( ...@@ -577,7 +752,9 @@ class AsyncMultiModalItemTracker(
for modality, coros in self._items_by_modality.items() for modality, coros in self._items_by_modality.items()
} }
return _resolve_items(resolved_items_by_modality, self.mm_processor) return _resolve_items(
resolved_items_by_modality, self.mm_processor, self._modality_order
)
def create_parser(self) -> "BaseMultiModalContentParser": def create_parser(self) -> "BaseMultiModalContentParser":
return AsyncMultiModalContentParser(self) return AsyncMultiModalContentParser(self)
......
...@@ -782,6 +782,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -782,6 +782,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
), ),
# Backend for Video IO # Backend for Video IO
# - "opencv": Default backend that uses OpenCV stream buffered backend. # - "opencv": Default backend that uses OpenCV stream buffered backend.
# - "identity": Returns raw video bytes for model processor to handle.
# #
# Custom backend implementations can be registered # Custom backend implementations can be registered
# via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
......
This diff is collapsed.
This diff is collapsed.
...@@ -359,6 +359,7 @@ _MULTIMODAL_MODELS = { ...@@ -359,6 +359,7 @@ _MULTIMODAL_MODELS = {
), ),
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"KimiK25ForConditionalGeneration": ("kimi_k25", "KimiK25ForConditionalGeneration"), # noqa: E501
"LightOnOCRForConditionalGeneration": ( "LightOnOCRForConditionalGeneration": (
"lightonocr", "lightonocr",
"LightOnOCRForConditionalGeneration", "LightOnOCRForConditionalGeneration",
......
...@@ -20,6 +20,7 @@ from typing import ( ...@@ -20,6 +20,7 @@ from typing import (
) )
import numpy as np import numpy as np
from PIL.Image import Image
from typing_extensions import NotRequired, TypeVar from typing_extensions import NotRequired, TypeVar
from vllm.utils.collection_utils import full_groupby, is_list_of from vllm.utils.collection_utils import full_groupby, is_list_of
...@@ -29,7 +30,6 @@ from vllm.utils.jsontree import json_map_leaves ...@@ -29,7 +30,6 @@ from vllm.utils.jsontree import json_map_leaves
if TYPE_CHECKING: if TYPE_CHECKING:
import torch import torch
import torch.types import torch.types
from PIL.Image import Image
from transformers.feature_extraction_utils import BatchFeature from transformers.feature_extraction_utils import BatchFeature
from .media import MediaWithBytes from .media import MediaWithBytes
...@@ -105,6 +105,28 @@ The number of data items allowed per modality is restricted by ...@@ -105,6 +105,28 @@ The number of data items allowed per modality is restricted by
""" """
class VisionChunkImage(TypedDict):
"""Represents an image wrapped as a vision chunk."""
type: Literal["image"]
image: Image
uuid: str | None
class VisionChunkVideo(TypedDict):
"""Represents a video chunk with metadata."""
type: Literal["video_chunk"]
video_chunk: list[Image]
uuid: str | None
prompt: str
video_idx: int
VisionChunk = VisionChunkImage | VisionChunkVideo
"""A vision chunk is either an image or a video chunk."""
@final @final
class MultiModalDataBuiltins(TypedDict, total=False): class MultiModalDataBuiltins(TypedDict, total=False):
"""Type annotations for modality types predefined by vLLM.""" """Type annotations for modality types predefined by vLLM."""
...@@ -118,6 +140,9 @@ class MultiModalDataBuiltins(TypedDict, total=False): ...@@ -118,6 +140,9 @@ class MultiModalDataBuiltins(TypedDict, total=False):
audio: ModalityData[AudioItem] audio: ModalityData[AudioItem]
"""The input audio(s).""" """The input audio(s)."""
vision_chunk: ModalityData[VisionChunk]
"""The input visual atom(s) - unified modality for images and video chunks."""
MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
""" """
......
...@@ -384,6 +384,13 @@ class VideoEmbeddingItems(EmbeddingItems): ...@@ -384,6 +384,13 @@ class VideoEmbeddingItems(EmbeddingItems):
super().__init__(data, "video", expected_hidden_size) super().__init__(data, "video", expected_hidden_size)
class VisionChunkProcessorItems(ProcessorBatchItems[Any]):
"""Processor items for vision chunks (unified image and video chunks)."""
def __init__(self, data: Sequence[Any]) -> None:
super().__init__(data, "vision_chunk")
_D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
...@@ -652,11 +659,23 @@ class MultiModalDataParser: ...@@ -652,11 +659,23 @@ class MultiModalDataParser:
return VideoProcessorItems(new_videos, metadata=metadata_lst) return VideoProcessorItems(new_videos, metadata=metadata_lst)
def _parse_vision_chunk_data(
self,
data: ModalityData[Any],
) -> ModalityDataItems[Any, Any] | None:
"""Parse vision chunk data (unified image and video chunks)."""
if data is None or self._is_empty(data):
return None
if self.is_embeddings(data):
raise ValueError("Do not support embedding data for vision_chunk right now")
return VisionChunkProcessorItems(data)
def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
return { return {
"audio": self._parse_audio_data, "audio": self._parse_audio_data,
"image": self._parse_image_data, "image": self._parse_image_data,
"video": self._parse_video_data, "video": self._parse_video_data,
"vision_chunk": self._parse_vision_chunk_data,
} }
def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems:
......
...@@ -235,6 +235,27 @@ class VideoLoader: ...@@ -235,6 +235,27 @@ class VideoLoader:
VIDEO_LOADER_REGISTRY = ExtensionManager() VIDEO_LOADER_REGISTRY = ExtensionManager()
@VIDEO_LOADER_REGISTRY.register("identity")
class IdentityVideoLoader(VideoLoader):
"""IdentityVideoLoader returns raw video bytes without decoding.
This allows the model processor to handle video decoding and
is required for models like Kimi-K2.5 that need custom video chunk splitting.
NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
to opencv before release if needed.
"""
@classmethod
def load_bytes(
cls,
data: bytes,
num_frames: int = -1,
**kwargs: Any,
) -> tuple[Any, Any]:
return data, None
@VIDEO_LOADER_REGISTRY.register("opencv") @VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader): class OpenCVVideoBackend(VideoLoader):
def get_cv2_video_api(self): def get_cv2_video_api(self):
......
...@@ -53,8 +53,8 @@ _REASONING_PARSERS_TO_REGISTER = { ...@@ -53,8 +53,8 @@ _REASONING_PARSERS_TO_REGISTER = {
"HunyuanA13BReasoningParser", "HunyuanA13BReasoningParser",
), ),
"kimi_k2": ( "kimi_k2": (
"deepseek_r1_reasoning_parser", "kimi_k2_reasoning_parser",
"DeepSeekR1ReasoningParser", "KimiK2ReasoningParser",
), ),
"minimax_m2": ( "minimax_m2": (
"minimax_m2_reasoning_parser", "minimax_m2_reasoning_parser",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .identity_reasoning_parser import IdentityReasoningParser
if TYPE_CHECKING:
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
else:
ChatCompletionRequest = Any
logger = init_logger(__name__)
class KimiK2ReasoningParser(ReasoningParser):
"""
Kimi K2 parser that delegates to either DeepSeekR1ReasoningParser or
IdentityReasoningParser based on `thinking` and `separate_reasoning`.
Unlike DeepSeekV3ReasoningParser which defaults to NOT thinking,
KimiK2ReasoningParser defaults to thinking mode (uses DeepSeekR1ReasoningParser).
"""
def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
# Key difference: default to True instead of False
thinking = bool(chat_kwargs.pop("thinking", True))
if thinking:
self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
else:
self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return self._parser.is_reasoning_end(input_ids)
def is_reasoning_end_streaming(
self, input_ids: list[int], delta_ids: list[int]
) -> bool:
return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids)
def extract_reasoning(
self, model_output: str, request: "ChatCompletionRequest"
) -> tuple[str | None, str | None]:
return self._parser.extract_reasoning(model_output, request)
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
return self._parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
delta_token_ids,
)
...@@ -20,9 +20,11 @@ from vllm.entrypoints.chat_utils import ( ...@@ -20,9 +20,11 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
ChatTemplateResolutionError, ChatTemplateResolutionError,
ConversationMessage, ConversationMessage,
build_video_prompts_from_mm_data,
load_chat_template, load_chat_template,
parse_chat_messages, parse_chat_messages,
parse_chat_messages_async, parse_chat_messages_async,
rebuild_mm_uuids_from_mm_data,
) )
from vllm.inputs import TextPrompt, TokensPrompt from vllm.inputs import TextPrompt, TokensPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -547,6 +549,40 @@ class HfRenderer(RendererLike): ...@@ -547,6 +549,40 @@ class HfRenderer(RendererLike):
**kwargs, **kwargs,
) )
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos.
if (
getattr(model_config.hf_config, "use_unified_vision_chunk", False)
and mm_uuids is not None
and mm_data is not None
):
mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)
# get video placehoder, replace it with runtime video-chunk prompts
video_placeholder = getattr(
model_config.hf_config, "video_placeholder", None
)
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
# replace in order
prompt_raw_parts = prompt_raw.split(video_placeholder)
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
[
prompt_raw_parts[i] + video_prompts[i]
for i in range(len(video_prompts))
]
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
prompt = ( prompt = (
TextPrompt(prompt=prompt_raw) TextPrompt(prompt=prompt_raw)
if isinstance(prompt_raw, str) if isinstance(prompt_raw, str)
...@@ -587,6 +623,40 @@ class HfRenderer(RendererLike): ...@@ -587,6 +623,40 @@ class HfRenderer(RendererLike):
**kwargs, **kwargs,
) )
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos.
if (
getattr(model_config.hf_config, "use_unified_vision_chunk", False)
and mm_uuids is not None
and mm_data is not None
):
mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)
# get video placehoder, replace it with runtime video-chunk prompts
video_placeholder = getattr(
model_config.hf_config, "video_placeholder", None
)
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
# replace in order
prompt_raw_parts = prompt_raw.split(video_placeholder)
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
[
prompt_raw_parts[i] + video_prompts[i]
for i in range(len(video_prompts))
]
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
prompt = ( prompt = (
TextPrompt(prompt=prompt_raw) TextPrompt(prompt=prompt_raw)
if isinstance(prompt_raw, str) if isinstance(prompt_raw, str)
......
...@@ -81,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( ...@@ -81,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
isaac="IsaacConfig", isaac="IsaacConfig",
kimi_linear="KimiLinearConfig", kimi_linear="KimiLinearConfig",
kimi_vl="KimiVLConfig", kimi_vl="KimiVLConfig",
kimi_k25="KimiK25Config",
RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct)
RefinedWebModel="RWConfig", # For tiiuae/falcon-7b(-instruct) RefinedWebModel="RWConfig", # For tiiuae/falcon-7b(-instruct)
jais="JAISConfig", jais="JAISConfig",
......
...@@ -38,6 +38,7 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -38,6 +38,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"MoonViTConfig": "vllm.transformers_utils.configs.moonvit", "MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
"KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear", "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
"KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl", "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
"KimiK25Config": "vllm.transformers_utils.configs.kimi_k25",
"NemotronConfig": "vllm.transformers_utils.configs.nemotron", "NemotronConfig": "vllm.transformers_utils.configs.nemotron",
"NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h", "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
"Olmo3Config": "vllm.transformers_utils.configs.olmo3", "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
...@@ -77,6 +78,7 @@ __all__ = [ ...@@ -77,6 +78,7 @@ __all__ = [
"MoonViTConfig", "MoonViTConfig",
"KimiLinearConfig", "KimiLinearConfig",
"KimiVLConfig", "KimiVLConfig",
"KimiK25Config",
"NemotronConfig", "NemotronConfig",
"NemotronHConfig", "NemotronHConfig",
"Olmo3Config", "Olmo3Config",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Kimi-K2.5 Model Configuration.
This configuration supports video-chunk as an internal modality type.
A video-chunk is the smallest independently processable unit of video.
"""
from transformers import DeepseekV3Config
from transformers.configuration_utils import PretrainedConfig
class KimiK25VisionConfig(PretrainedConfig):
model_type = "kimi_k25_vision"
def __init__(
self,
# Vision Tower
patch_size: int = 14,
init_pos_emb_height: int = 64,
init_pos_emb_width: int = 64,
init_pos_emb_time: int = 4,
pos_emb_type: str = "divided_fixed",
num_attention_heads: int = 16,
num_hidden_layers: int = 27,
hidden_size: int = 1152,
intermediate_size: int = 4304,
merge_kernel_size: tuple[int, int] = (2, 2),
video_attn_type: str = "spatial_temporal",
merge_type: str = "sd2_tpool",
# MM Projector
mm_projector_type: str = "patchmerger",
mm_hidden_size: int | None = None,
projector_hidden_act: str = "gelu",
projector_ln_eps: float = 1e-5,
**kwargs,
):
super().__init__(**kwargs)
# Vision Tower
self.patch_size = patch_size
self.init_pos_emb_height = init_pos_emb_height
self.init_pos_emb_width = init_pos_emb_width
self.init_pos_emb_time = init_pos_emb_time
self.pos_emb_type = pos_emb_type
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = num_hidden_layers
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.merge_kernel_size = merge_kernel_size
self.video_attn_type = video_attn_type
self.merge_type = merge_type
# MM Projector
self.mm_projector_type = mm_projector_type
if mm_hidden_size is not None:
self.mm_hidden_size = mm_hidden_size
else:
self.mm_hidden_size = hidden_size
self.projector_hidden_act = projector_hidden_act
self.projector_ln_eps = projector_ln_eps
class KimiK25Config(PretrainedConfig):
"""Kimi-K2.5 model configuration.
Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks.
A video-chunk consists of multiple consecutive frames
that are processed together with temporal pooling.
Args:
vision_config: Configuration for the vision tower and projector.
text_config: Configuration for the text model (DeepseekV3).
ignore_index: The ignore index for the loss function.
media_placeholder_token_id: The token ID for media placeholders.
pad_token_id: The token ID for padding.
"""
model_type = "kimi_k25"
def __init__(
self,
vision_config: dict | KimiK25VisionConfig | None = None,
text_config: dict | DeepseekV3Config | None = None,
ignore_index: int = -100,
media_placeholder_token_id: int = 163605,
pad_token_id: int = 0,
use_unified_vision_chunk: bool = False,
video_placeholder: str = "<|kimi_k25_video_placeholder|>",
**kwargs,
):
# Vision config
if vision_config is None:
vision_config = KimiK25VisionConfig()
elif isinstance(vision_config, dict):
vision_config = KimiK25VisionConfig(**vision_config)
self.vision_config: KimiK25VisionConfig = vision_config
# Text config
if text_config is None:
text_config = DeepseekV3Config()
elif isinstance(text_config, dict):
text_config = DeepseekV3Config(**text_config)
self.text_config: DeepseekV3Config = text_config
# Set mm_hidden_size to text hidden size if not explicitly set
if self.vision_config.mm_hidden_size == self.vision_config.hidden_size:
self.vision_config.mm_hidden_size = self.text_config.hidden_size
# Other config
self.ignore_index = ignore_index
self.media_placeholder_token_id = media_placeholder_token_id
self.use_unified_vision_chunk = use_unified_vision_chunk
self.video_placeholder = video_placeholder
# Propagate quantization config from text model
if getattr(self.text_config, "quantization_config", None) is not None:
self.quantization_config = self.text_config.quantization_config
super().__init__(pad_token_id=pad_token_id, **kwargs)
@property
def hidden_size(self) -> int:
"""Get hidden size from text config for compatibility."""
return self.text_config.hidden_size
@property
def vocab_size(self) -> int:
"""Get vocab size from text config for compatibility."""
return self.text_config.vocab_size
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment