Unverified Commit 3a92c6f3 authored by Isotr0py's avatar Isotr0py Committed by GitHub
Browse files

[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent e01ff5c0
...@@ -24,12 +24,25 @@ from vllm.multimodal.utils import ( ...@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
) )
from vllm.utils.serial_utils import tensor2base64 from vllm.utils.serial_utils import tensor2base64
KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5"
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B" QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function")
def kimi_k2_5_model_config():
    """Build a fresh ``ModelConfig`` for Kimi-K2.5 for each test function.

    The config uses the ``generate`` runner, trusts remote code, and allows
    up to two images per prompt.
    """
    mm_limits = {"image": 2}
    return ModelConfig(
        KIMI_K2_5_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt=mm_limits,
    )
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def phi3v_model_config(): def phi3v_model_config():
return ModelConfig( return ModelConfig(
...@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input( ...@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
assert image_data[i] is None assert image_data[i] is None
def _assert_mm_data_is_vision_chunk_input(
mm_data: MultiModalDataDict | None,
vision_chunk_count: int,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"vision_chunk"}
vision_chunk_data = mm_data.get("vision_chunk")
assert vision_chunk_data is not None
assert (
isinstance(vision_chunk_data, list)
and len(vision_chunk_data) == vision_chunk_count
)
def _assert_mm_uuids( def _assert_mm_uuids(
mm_uuids: MultiModalUUIDDict | None, mm_uuids: MultiModalUUIDDict | None,
media_count: int, media_count: int,
...@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ...@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
] ]
_assert_mm_data_inputs(mm_data, {"audio": 1}) _assert_mm_data_inputs(mm_data, {"audio": 1})
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
def test_parse_chat_messages_image_vision_chunk(
    kimi_k2_5_model_config,
    image_url,
):
    """A single image URL is parsed into one ``vision_chunk`` item."""
    user_content = [
        {"type": "text", "text": "Analyze this image."},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The image placeholder is expected ahead of the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    assert conversation == [
        {"role": "user", "content": f"{image_token}\nAnalyze this image."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
def test_parse_chat_messages_video_vision_chunk(
    kimi_k2_5_model_config,
    video_url,
):
    """A single video URL is parsed into one ``vision_chunk`` item."""
    user_content = [
        {"type": "text", "text": "Analyze this video."},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The video placeholder is expected ahead of the user's text.
    video_token = "<|kimi_k25_video_placeholder|>"
    assert conversation == [
        {"role": "user", "content": f"{video_token}\nAnalyze this video."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
def test_parse_chat_messages_image_vision_chunk_with_uuid(
    kimi_k2_5_model_config,
    image_url,
):
    """A user-supplied image UUID is reported under ``vision_chunk``."""
    image_uuid = "image_123"
    user_content = [
        {"type": "text", "text": "Analyze this image."},
        {"type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The image placeholder is expected ahead of the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    assert conversation == [
        {"role": "user", "content": f"{image_token}\nAnalyze this image."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
def test_parse_chat_messages_video_vision_chunk_with_uuid(
    kimi_k2_5_model_config,
    video_url,
):
    """A user-supplied video UUID is reported under ``vision_chunk``."""
    video_uuid = "video_456"
    user_content = [
        {"type": "text", "text": "Analyze this video."},
        {"type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The video placeholder is expected ahead of the user's text.
    video_token = "<|kimi_k25_video_placeholder|>"
    assert conversation == [
        {"role": "user", "content": f"{video_token}\nAnalyze this video."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
def test_parse_chat_messages_mixed_vision_chunk(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """An image plus a video in one message yield two ``vision_chunk`` items."""
    user_content = [
        {"type": "text", "text": "Analyze this image and video."},
        {"type": "image_url", "image_url": {"url": image_url}},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Placeholders are expected in request order, each on its own line,
    # before the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_token = "<|kimi_k25_video_placeholder|>"
    expected_content = (
        f"{image_token}\n{video_token}\n"
        "Analyze this image and video."
    )
    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
def test_parse_chat_messages_mixed_vision_chunk_with_uuid(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """Image and video UUIDs are both reported, in request order."""
    image_uuid = "image_123"
    video_uuid = "video_456"
    user_content = [
        {"type": "text", "text": "Analyze this image and video."},
        {"type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid},
        {"type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Placeholders are expected in request order, each on its own line,
    # before the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_token = "<|kimi_k25_video_placeholder|>"
    expected_content = (
        f"{image_token}\n{video_token}\n"
        "Analyze this image and video."
    )
    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(
        mm_uuids,
        2,
        expected_uuids=[image_uuid, video_uuid],
        modality="vision_chunk",
    )
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_async(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """Async parsing of an image plus a video yields two ``vision_chunk`` items."""
    user_content = [
        {"type": "text", "text": "Analyze this image and video."},
        {"type": "image_url", "image_url": {"url": image_url}},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Placeholders are expected in request order, each on its own line,
    # before the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_token = "<|kimi_k25_video_placeholder|>"
    expected_content = (
        f"{image_token}\n{video_token}\n"
        "Analyze this image and video."
    )
    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """Async parsing reports image and video UUIDs in request order."""
    image_uuid = "image_123"
    video_uuid = "video_456"
    user_content = [
        {"type": "text", "text": "Analyze this image and video."},
        {"type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid},
        {"type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Placeholders are expected in request order, each on its own line,
    # before the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_token = "<|kimi_k25_video_placeholder|>"
    expected_content = (
        f"{image_token}\n{video_token}\n"
        "Analyze this image and video."
    )
    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(
        mm_uuids,
        2,
        expected_uuids=[image_uuid, video_uuid],
        modality="vision_chunk",
    )
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_async(
    kimi_k2_5_model_config,
    image_url,
):
    """Async parsing of a single image URL yields one ``vision_chunk`` item."""
    user_content = [
        {"type": "text", "text": "Analyze this image."},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The image placeholder is expected ahead of the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    assert conversation == [
        {"role": "user", "content": f"{image_token}\nAnalyze this image."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_async(
    kimi_k2_5_model_config,
    video_url,
):
    """Async parsing of a single video URL yields one ``vision_chunk`` item."""
    user_content = [
        {"type": "text", "text": "Analyze this video."},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The video placeholder is expected ahead of the user's text.
    video_token = "<|kimi_k25_video_placeholder|>"
    assert conversation == [
        {"role": "user", "content": f"{video_token}\nAnalyze this video."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_with_uuid_async(
    kimi_k2_5_model_config,
    image_url,
):
    """Async parsing reports a user-supplied image UUID under ``vision_chunk``."""
    image_uuid = "image_123"
    user_content = [
        {"type": "text", "text": "Analyze this image."},
        {"type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The image placeholder is expected ahead of the user's text.
    image_token = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    assert conversation == [
        {"role": "user", "content": f"{image_token}\nAnalyze this image."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
    kimi_k2_5_model_config,
    video_url,
):
    """Async parsing reports a user-supplied video UUID under ``vision_chunk``."""
    video_uuid = "video_456"
    user_content = [
        {"type": "text", "text": "Analyze this video."},
        {"type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_content}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # The video placeholder is expected ahead of the user's text.
    video_token = "<|kimi_k25_video_placeholder|>"
    assert conversation == [
        {"role": "user", "content": f"{video_token}\nAnalyze this video."}
    ]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
...@@ -411,6 +411,11 @@ def test_processing_correctness( ...@@ -411,6 +411,11 @@ def test_processing_correctness(
"Qwen-VL tokenizer requires downloading a font file from " "Qwen-VL tokenizer requires downloading a font file from "
"servers that often refuse connections in CI" "servers that often refuse connections in CI"
) )
if model_id == "moonshotai/Kimi-K2.5":
# FIXME(Isaac): Fix Kimi-K2.5's offline inference about vision chunks.
pytest.skip(
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
)
_test_processing_correctness( _test_processing_correctness(
model_id, model_id,
......
...@@ -155,6 +155,12 @@ def initialize_dummy_model( ...@@ -155,6 +155,12 @@ def initialize_dummy_model(
@create_new_process_for_each_test() @create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test()) @pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str): def test_model_tensor_schema(model_id: str):
if model_id == "moonshotai/Kimi-K2.5":
# FIXME(Isotr0py): Fix Kimi-K2.5's offline inference about vision chunks.
pytest.skip(
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version( model_info.check_transformers_version(
......
...@@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"KimiK25ForConditionalGeneration": _HfExamplesInfo( "KimiK25ForConditionalGeneration": _HfExamplesInfo(
"moonshotai/Kimi-K2.5", "moonshotai/Kimi-K2.5",
trust_remote_code=True, trust_remote_code=True,
is_available_online=False,
), ),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo( "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B-1025" "lightonai/LightOnOCR-1B-1025"
......
...@@ -454,78 +454,6 @@ def _get_embeds_data( ...@@ -454,78 +454,6 @@ def _get_embeds_data(
raise NotImplementedError(type(data_items)) raise NotImplementedError(type(data_items))
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: MultiModalUUIDDict,
    mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict:
    """Refresh the ``vision_chunk`` UUIDs from the processed chunk items.

    Splitting a video into chunks produces one UUID per chunk, so the UUIDs
    recorded before processing have to be replaced by the ones carried on
    the processed items.

    Args:
        mm_uuids: UUID dictionary recorded before chunk processing.
        mm_data: Processed multimodal data; its ``"vision_chunk"`` entry,
            when present, must contain only dict items.

    Returns:
        ``mm_uuids`` itself when ``mm_data`` has no ``"vision_chunk"`` entry.
        Otherwise a shallow copy whose ``"vision_chunk"`` value is the list
        of non-``None`` chunk UUIDs; the copy is left as-is when no chunk
        carries a UUID.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return mm_uuids

    def _uuid_of(chunk):
        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
        assert isinstance(chunk, dict)
        return chunk.get("uuid")

    rebuilt = dict(mm_uuids)
    chunk_uuids = [found for found in map(_uuid_of, chunks) if found is not None]
    if chunk_uuids:
        rebuilt["vision_chunk"] = chunk_uuids
    return rebuilt
def build_video_prompts_from_mm_data(
    mm_data: MultiModalDataDict,
) -> list[str]:
    """Assemble one prompt string per video from its chunk prompts.

    Only items whose ``"type"`` is ``"video_chunk"`` contribute. Their
    ``"prompt"`` values (default ``""``) are concatenated per ``"video_idx"``
    (default ``0``) in the order the chunks appear.

    Args:
        mm_data: Processed multimodal data; its ``"vision_chunk"`` entry,
            when present, must contain only dict items.

    Returns:
        The per-video prompts ordered by ascending ``video_idx``, or an empty
        list when ``mm_data`` has no ``"vision_chunk"`` entry.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return []

    prompts_by_video: dict[int, list[str]] = defaultdict(list)
    for chunk in chunks:
        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
        assert isinstance(chunk, dict)
        if chunk.get("type") != "video_chunk":
            continue
        prompts_by_video[chunk.get("video_idx", 0)].append(chunk.get("prompt", ""))

    return ["".join(prompts_by_video[idx]) for idx in sorted(prompts_by_video)]
class BaseMultiModalItemTracker(ABC, Generic[_T]): class BaseMultiModalItemTracker(ABC, Generic[_T]):
""" """
Tracks multi-modal items in a given request and ensures that the number Tracks multi-modal items in a given request and ensures that the number
...@@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
raise NotImplementedError raise NotImplementedError
def _resolve_vision_chunk_items(
vision_chunk_items: list[tuple[object, str | None]],
mm_processor: BaseMultiModalProcessor,
vision_chunks_modality_order: list[str],
):
# Process vision_chunk items - extract from (data, modality) tuples
# and convert to VisionChunk types with proper UUID handling
vision_chunks_uuids = [uuid for data, uuid in vision_chunk_items]
assert len(vision_chunk_items) == len(vision_chunks_modality_order), (
f"vision_chunk items ({len(vision_chunk_items)}) and "
f"modality_order ({len(vision_chunks_modality_order)}) must have same length"
)
processed_chunks: list[VisionChunk] = []
video_idx = 0
for inner_modality, (data, uuid) in zip(
vision_chunks_modality_order, vision_chunk_items
):
if inner_modality == "image":
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if hasattr(data, "media"):
image_data = data.media # type: ignore[union-attr]
processed_chunks.append(
VisionChunkImage(type="image", image=image_data, uuid=uuid)
)
else:
processed_chunks.append(data) # type: ignore[arg-type]
elif inner_modality == "video":
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if hasattr(mm_processor, "split_video_chunks") and data is not None:
try:
video_uuid = uuid or random_uuid()
# video await result is (video_data, video_meta) tuple
if isinstance(data, tuple) and len(data) >= 1:
video_data = data[0]
else:
video_data = data
video_chunks = mm_processor.split_video_chunks(video_data)
for i, vc in enumerate(video_chunks):
processed_chunks.append(
VisionChunkVideo(
type="video_chunk",
video_chunk=vc["video_chunk"],
uuid=f"{video_uuid}-{i}",
video_idx=video_idx,
prompt=vc["prompt"],
)
)
video_idx += 1
except Exception as e:
logger.warning("Failed to split video chunks: %s", e)
processed_chunks.append(data) # type: ignore[arg-type]
else:
processed_chunks.append(data) # type: ignore[arg-type]
return processed_chunks, vision_chunks_uuids
def _resolve_items( def _resolve_items(
items_by_modality: dict[str, list[tuple[object, str | None]]], items_by_modality: dict[str, list[tuple[object, str | None]]],
mm_processor: BaseMultiModalProcessor, mm_processor: BaseMultiModalProcessor,
vision_chunk_modality_order: dict[str, list[str]], modality_order: dict[str, list[str]],
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]: ) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
if "image" in items_by_modality and "image_embeds" in items_by_modality: if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed") raise ValueError("Mixing raw image and embedding inputs is not allowed")
...@@ -654,71 +644,13 @@ def _resolve_items( ...@@ -654,71 +644,13 @@ def _resolve_items(
if "vision_chunk" in items_by_modality: if "vision_chunk" in items_by_modality:
# Process vision_chunk items - extract from (data, modality) tuples # Process vision_chunk items - extract from (data, modality) tuples
# and convert to VisionChunk types with proper UUID handling # and convert to VisionChunk types with proper UUID handling
vision_chunk_items = items_by_modality["vision_chunk"] processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items(
modality_order = vision_chunk_modality_order.get("vision_chunk", []) items_by_modality["vision_chunk"],
mm_uuids["vision_chunk"] = [ mm_processor,
uuid for data, uuid in items_by_modality["vision_chunk"] modality_order.get("vision_chunk", []),
]
# Filter out None items (from asyncio.sleep(0) placeholders)
filtered_items = [
(idx, item)
for idx, item in enumerate(vision_chunk_items)
if item is not None
]
assert len(filtered_items) == len(modality_order), (
f"vision_chunk items ({len(filtered_items)}) and "
f"modality_order ({len(modality_order)}) must have same length"
) )
processed_chunks: list[VisionChunk] = []
video_idx = 0
for i, (idx, item) in enumerate(filtered_items):
inner_modality = modality_order[i]
data, uuid = item
uuid_val = uuid if idx < len(mm_uuids["vision_chunk"]) else None
if inner_modality == "image":
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if hasattr(data, "media"):
image_data = data.media # type: ignore[union-attr]
processed_chunks.append(
VisionChunkImage(type="image", image=image_data, uuid=uuid_val)
)
else:
processed_chunks.append(data) # type: ignore[arg-type]
elif inner_modality == "video":
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if hasattr(mm_processor, "split_video_chunks") and data is not None:
try:
video_uuid = uuid_val or random_uuid()
# video await result is (video_data, video_meta) tuple
if isinstance(data, tuple) and len(data) >= 1:
video_data = data[0]
else:
video_data = data
video_chunks = mm_processor.split_video_chunks(video_data)
for i, vc in enumerate(video_chunks):
processed_chunks.append(
VisionChunkVideo(
type="video_chunk",
video_chunk=vc["video_chunk"],
uuid=f"{video_uuid}-{i}",
video_idx=video_idx,
prompt=vc["prompt"],
)
)
video_idx += 1
except Exception as e:
logger.warning("Failed to split video chunks: %s", e)
processed_chunks.append(data) # type: ignore[arg-type]
else:
processed_chunks.append(data) # type: ignore[arg-type]
mm_data["vision_chunk"] = processed_chunks mm_data["vision_chunk"] = processed_chunks
mm_uuids["vision_chunk"] = vision_chunk_uuids
return mm_data, mm_uuids return mm_data, mm_uuids
......
...@@ -235,27 +235,6 @@ class VideoLoader: ...@@ -235,27 +235,6 @@ class VideoLoader:
VIDEO_LOADER_REGISTRY = ExtensionManager() VIDEO_LOADER_REGISTRY = ExtensionManager()
@VIDEO_LOADER_REGISTRY.register("identity")
class IdentityVideoLoader(VideoLoader):
    """Video loader that hands the raw video bytes back without decoding.

    Decoding is left to the model processor, which is required for models
    like Kimi-K2.5 that need custom video chunk splitting.

    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
    to opencv before release if needed.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        """Return ``data`` unchanged, paired with ``None``.

        ``num_frames`` and ``kwargs`` are accepted but not used.
        """
        passthrough: tuple[Any, Any] = (data, None)
        return passthrough
@VIDEO_LOADER_REGISTRY.register("opencv") @VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader): class OpenCVVideoBackend(VideoLoader):
def get_cv2_video_api(self): def get_cv2_video_api(self):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect import inspect
from collections import deque import itertools
from collections import defaultdict, deque
from collections.abc import Set from collections.abc import Set
from functools import lru_cache from functools import lru_cache
from typing import Any, cast from typing import TYPE_CHECKING, Any, cast
import jinja2 import jinja2
import jinja2.ext import jinja2.ext
...@@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import ( ...@@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
ChatTemplateResolutionError, ChatTemplateResolutionError,
ConversationMessage, ConversationMessage,
build_video_prompts_from_mm_data,
load_chat_template, load_chat_template,
parse_chat_messages, parse_chat_messages,
parse_chat_messages_async, parse_chat_messages_async,
rebuild_mm_uuids_from_mm_data,
) )
from vllm.inputs import TextPrompt, TokensPrompt from vllm.inputs import TextPrompt, TokensPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw ...@@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw
from .protocol import RendererLike from .protocol import RendererLike
if TYPE_CHECKING:
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict
else:
MultiModalDataDict = dict[str, Any]
MultiModalUUIDDict = dict[str, Any]
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -479,6 +485,104 @@ def safe_apply_chat_template( ...@@ -479,6 +485,104 @@ def safe_apply_chat_template(
raise ValueError(str(e)) from e raise ValueError(str(e)) from e
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: "MultiModalUUIDDict",
    mm_data: "MultiModalDataDict",
) -> "MultiModalUUIDDict":
    """Refresh the ``vision_chunk`` UUIDs from the processed chunk items.

    Splitting a video into chunks produces one UUID per chunk, so the UUIDs
    recorded before processing have to be replaced by the ones carried on
    the processed items.

    Args:
        mm_uuids: UUID dictionary recorded before chunk processing.
        mm_data: Processed multimodal data; its ``"vision_chunk"`` entry,
            when present, must contain only dict items.

    Returns:
        ``mm_uuids`` itself when there is no ``"vision_chunk"`` entry or no
        chunk carries a UUID. Otherwise a shallow copy whose
        ``"vision_chunk"`` value is the list of non-``None`` chunk UUIDs.
    """
    raw_chunks = mm_data.get("vision_chunk")
    if raw_chunks is None:
        return mm_uuids

    assert all(isinstance(chunk, dict) for chunk in raw_chunks), (
        "Expected all vision_chunk items to be dicts"
    )
    chunk_dicts = cast(list[dict[str, Any]], raw_chunks)

    # Chunks without a UUID are left out rather than recorded as None.
    chunk_uuids = []
    for chunk in chunk_dicts:
        chunk_uuid = chunk.get("uuid")
        if chunk_uuid is not None:
            chunk_uuids.append(chunk_uuid)

    if not chunk_uuids:
        return mm_uuids
    return {**mm_uuids, "vision_chunk": chunk_uuids}
def build_video_prompts_from_mm_data(
    mm_data: "MultiModalDataDict",
) -> list[str]:
    """Assemble one prompt string per video from its chunk prompts.

    Only items whose ``"type"`` is ``"video_chunk"`` contribute. Their
    ``"prompt"`` values (default ``""``) are concatenated per ``"video_idx"``
    (default ``0``) in the order the chunks appear.

    Args:
        mm_data: Processed multimodal data; its ``"vision_chunk"`` entry,
            when present, must contain only dict items.

    Returns:
        The per-video prompts ordered by ascending ``video_idx``, or an empty
        list when ``mm_data`` has no ``"vision_chunk"`` entry.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return []

    prompts_by_video: dict[int, list[str]] = {}
    for chunk in chunks:
        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
        assert isinstance(chunk, dict)
        if chunk.get("type") != "video_chunk":
            continue
        owner_idx = chunk.get("video_idx", 0)
        prompts_by_video.setdefault(owner_idx, []).append(chunk.get("prompt", ""))

    ordered_prompts = []
    for owner_idx in sorted(prompts_by_video):
        ordered_prompts.append("".join(prompts_by_video[owner_idx]))
    return ordered_prompts
def replace_vision_chunk_video_placeholder(
prompt_raw: str | list[int],
mm_data: "MultiModalDataDict",
video_placeholder: str | None,
) -> str | list[int]:
# get video placehoder, replace it with runtime video-chunk prompts
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
# replace in order
prompt_raw_parts = prompt_raw.split(video_placeholder)
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
itertools.chain.from_iterable(zip(prompt_raw_parts, video_prompts))
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
return prompt_raw
class HfRenderer(RendererLike): class HfRenderer(RendererLike):
@classmethod @classmethod
def from_config( def from_config(
...@@ -496,6 +600,9 @@ class HfRenderer(RendererLike): ...@@ -496,6 +600,9 @@ class HfRenderer(RendererLike):
super().__init__() super().__init__()
self.config = config self.config = config
self.use_unified_vision_chunk = getattr(
config.hf_config, "use_unified_vision_chunk", False
)
if config.skip_tokenizer_init: if config.skip_tokenizer_init:
tokenizer = None tokenizer = None
...@@ -552,7 +659,7 @@ class HfRenderer(RendererLike): ...@@ -552,7 +659,7 @@ class HfRenderer(RendererLike):
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5 # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos. # model which uses unified vision chunks for both images and videos.
if ( if (
getattr(model_config.hf_config, "use_unified_vision_chunk", False) self.use_unified_vision_chunk
and mm_uuids is not None and mm_uuids is not None
and mm_data is not None and mm_data is not None
): ):
...@@ -562,26 +669,11 @@ class HfRenderer(RendererLike): ...@@ -562,26 +669,11 @@ class HfRenderer(RendererLike):
video_placeholder = getattr( video_placeholder = getattr(
model_config.hf_config, "video_placeholder", None model_config.hf_config, "video_placeholder", None
) )
if video_placeholder and isinstance(prompt_raw, str): prompt_raw = replace_vision_chunk_video_placeholder(
video_prompts = build_video_prompts_from_mm_data(mm_data) prompt_raw,
mm_data,
# replace in order video_placeholder,
prompt_raw_parts = prompt_raw.split(video_placeholder) )
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
[
prompt_raw_parts[i] + video_prompts[i]
for i in range(len(video_prompts))
]
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
prompt = ( prompt = (
TextPrompt(prompt=prompt_raw) TextPrompt(prompt=prompt_raw)
...@@ -626,7 +718,7 @@ class HfRenderer(RendererLike): ...@@ -626,7 +718,7 @@ class HfRenderer(RendererLike):
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5 # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos. # model which uses unified vision chunks for both images and videos.
if ( if (
getattr(model_config.hf_config, "use_unified_vision_chunk", False) self.use_unified_vision_chunk
and mm_uuids is not None and mm_uuids is not None
and mm_data is not None and mm_data is not None
): ):
...@@ -636,26 +728,11 @@ class HfRenderer(RendererLike): ...@@ -636,26 +728,11 @@ class HfRenderer(RendererLike):
video_placeholder = getattr( video_placeholder = getattr(
model_config.hf_config, "video_placeholder", None model_config.hf_config, "video_placeholder", None
) )
if video_placeholder and isinstance(prompt_raw, str): prompt_raw = replace_vision_chunk_video_placeholder(
video_prompts = build_video_prompts_from_mm_data(mm_data) prompt_raw,
mm_data,
# replace in order video_placeholder,
prompt_raw_parts = prompt_raw.split(video_placeholder) )
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
[
prompt_raw_parts[i] + video_prompts[i]
for i in range(len(video_prompts))
]
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
prompt = ( prompt = (
TextPrompt(prompt=prompt_raw) TextPrompt(prompt=prompt_raw)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment