Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3a92c6f3
Unverified
Commit
3a92c6f3
authored
Jan 29, 2026
by
Isotr0py
Committed by
GitHub
Jan 29, 2026
Browse files
[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)
Signed-off-by:
Isotr0py
<
mozf@mail2.sysu.edu.cn
>
parent
e01ff5c0
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
733 additions
and
204 deletions
+733
-204
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+531
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+5
-0
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+6
-0
tests/models/registry.py
tests/models/registry.py
+0
-1
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+68
-136
vllm/multimodal/video.py
vllm/multimodal/video.py
+0
-21
vllm/renderers/hf.py
vllm/renderers/hf.py
+123
-46
No files found.
tests/entrypoints/test_chat_utils.py
View file @
3a92c6f3
...
@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
...
@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
)
)
from
vllm.utils.serial_utils
import
tensor2base64
from
vllm.utils.serial_utils
import
tensor2base64
KIMI_K2_5_MODEL_ID
=
"moonshotai/Kimi-K2.5"
PHI3V_MODEL_ID
=
"microsoft/Phi-3.5-vision-instruct"
PHI3V_MODEL_ID
=
"microsoft/Phi-3.5-vision-instruct"
QWEN2AUDIO_MODEL_ID
=
"Qwen/Qwen2-Audio-7B-Instruct"
QWEN2AUDIO_MODEL_ID
=
"Qwen/Qwen2-Audio-7B-Instruct"
QWEN25OMNI_MODEL_ID
=
"Qwen/Qwen2.5-Omni-7B"
QWEN25OMNI_MODEL_ID
=
"Qwen/Qwen2.5-Omni-7B"
MISTRAL_MODEL_ID
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MISTRAL_MODEL_ID
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function")
def kimi_k2_5_model_config():
    """Build a fresh ``ModelConfig`` for the Kimi-K2.5 checkpoint per test.

    The config is created with the ``generate`` runner, remote code trusted,
    and ``limit_mm_per_prompt`` set to ``{"image": 2}``.
    """
    mm_limits = {
        "image": 2,
    }
    config = ModelConfig(
        KIMI_K2_5_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt=mm_limits,
    )
    return config
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
phi3v_model_config
():
def
phi3v_model_config
():
return
ModelConfig
(
return
ModelConfig
(
...
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
...
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
assert
image_data
[
i
]
is
None
assert
image_data
[
i
]
is
None
def _assert_mm_data_is_vision_chunk_input(
    mm_data: MultiModalDataDict | None,
    vision_chunk_count: int,
) -> None:
    """Assert that ``mm_data`` carries only ``vision_chunk`` entries.

    Args:
        mm_data: Parsed multimodal data returned by the chat-message parser.
        vision_chunk_count: Number of items expected under ``"vision_chunk"``.

    Raises:
        AssertionError: If ``mm_data`` is ``None``, has any key other than
            ``"vision_chunk"``, or the value is not a list of the expected
            length.
    """
    assert mm_data is not None

    # "vision_chunk" must be the one and only modality present.
    present_modalities = set(mm_data)
    assert present_modalities == {"vision_chunk"}

    chunks = mm_data.get("vision_chunk")
    assert chunks is not None
    assert isinstance(chunks, list)
    assert len(chunks) == vision_chunk_count
def
_assert_mm_uuids
(
def
_assert_mm_uuids
(
mm_uuids
:
MultiModalUUIDDict
|
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
,
media_count
:
int
,
media_count
:
int
,
...
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
...
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
]
]
_assert_mm_data_inputs
(
mm_data
,
{
"audio"
:
1
})
_assert_mm_data_inputs
(
mm_data
,
{
"audio"
:
1
})
_assert_mm_uuids
(
mm_uuids
,
1
,
modality
=
"audio"
,
expected_uuids
=
[
audio_uuid
])
_assert_mm_uuids
(
mm_uuids
,
1
,
modality
=
"audio"
,
expected_uuids
=
[
audio_uuid
])
def
test_parse_chat_messages_image_vision_chunk
(
kimi_k2_5_model_config
,
image_url
,
):
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this image."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
None
],
modality
=
"vision_chunk"
)
def
test_parse_chat_messages_video_vision_chunk
(
kimi_k2_5_model_config
,
video_url
,
):
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this video."
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this video."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
None
],
modality
=
"vision_chunk"
)
def
test_parse_chat_messages_image_vision_chunk_with_uuid
(
kimi_k2_5_model_config
,
image_url
,
):
image_uuid
=
"image_123"
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
"uuid"
:
image_uuid
,
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this image."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
image_uuid
],
modality
=
"vision_chunk"
)
def
test_parse_chat_messages_video_vision_chunk_with_uuid
(
kimi_k2_5_model_config
,
video_url
,
):
video_uuid
=
"video_456"
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this video."
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
"uuid"
:
video_uuid
,
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this video."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
video_uuid
],
modality
=
"vision_chunk"
)
def
test_parse_chat_messages_mixed_vision_chunk
(
kimi_k2_5_model_config
,
image_url
,
video_url
,
):
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image and video."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
image_placeholder
=
(
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
(
f
"
{
image_placeholder
}
\n
{
video_placeholder
}
\n
"
"Analyze this image and video."
),
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
2
)
_assert_mm_uuids
(
mm_uuids
,
2
,
expected_uuids
=
[
None
,
None
],
modality
=
"vision_chunk"
)
def
test_parse_chat_messages_mixed_vision_chunk_with_uuid
(
kimi_k2_5_model_config
,
image_url
,
video_url
,
):
image_uuid
=
"image_123"
video_uuid
=
"video_456"
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image and video."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
"uuid"
:
image_uuid
,
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
"uuid"
:
video_uuid
,
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
image_placeholder
=
(
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
(
f
"
{
image_placeholder
}
\n
{
video_placeholder
}
\n
"
"Analyze this image and video."
),
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
2
)
_assert_mm_uuids
(
mm_uuids
,
2
,
expected_uuids
=
[
image_uuid
,
video_uuid
],
modality
=
"vision_chunk"
)
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_mixed_vision_chunk_async
(
kimi_k2_5_model_config
,
image_url
,
video_url
,
):
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image and video."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
await
parse_chat_messages_async
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
image_placeholder
=
(
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
(
f
"
{
image_placeholder
}
\n
{
video_placeholder
}
\n
"
"Analyze this image and video."
),
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
2
)
_assert_mm_uuids
(
mm_uuids
,
2
,
expected_uuids
=
[
None
,
None
],
modality
=
"vision_chunk"
)
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_mixed_vision_chunk_with_uuid_async
(
kimi_k2_5_model_config
,
image_url
,
video_url
,
):
image_uuid
=
"image_123"
video_uuid
=
"video_456"
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image and video."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
"uuid"
:
image_uuid
,
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
"uuid"
:
video_uuid
,
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
await
parse_chat_messages_async
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
image_placeholder
=
(
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
(
f
"
{
image_placeholder
}
\n
{
video_placeholder
}
\n
"
"Analyze this image and video."
),
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
2
)
_assert_mm_uuids
(
mm_uuids
,
2
,
expected_uuids
=
[
image_uuid
,
video_uuid
],
modality
=
"vision_chunk"
)
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_image_vision_chunk_async
(
kimi_k2_5_model_config
,
image_url
,
):
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
await
parse_chat_messages_async
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this image."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
None
],
modality
=
"vision_chunk"
)
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_video_vision_chunk_async
(
kimi_k2_5_model_config
,
video_url
,
):
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this video."
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
await
parse_chat_messages_async
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this video."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
None
],
modality
=
"vision_chunk"
)
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_image_vision_chunk_with_uuid_async
(
kimi_k2_5_model_config
,
image_url
,
):
image_uuid
=
"image_123"
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this image."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
"uuid"
:
image_uuid
,
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
await
parse_chat_messages_async
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this image."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
image_uuid
],
modality
=
"vision_chunk"
)
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_video_vision_chunk_with_uuid_async
(
kimi_k2_5_model_config
,
video_url
,
):
video_uuid
=
"video_456"
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Analyze this video."
},
{
"type"
:
"video_url"
,
"video_url"
:
{
"url"
:
video_url
},
"uuid"
:
video_uuid
,
},
],
}
]
conversation
,
mm_data
,
mm_uuids
=
await
parse_chat_messages_async
(
messages
,
kimi_k2_5_model_config
,
content_format
=
"string"
,
)
placeholder
=
"<|kimi_k25_video_placeholder|>"
expected_conversation
=
[
{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
Analyze this video."
,
}
]
assert
conversation
==
expected_conversation
_assert_mm_data_is_vision_chunk_input
(
mm_data
,
1
)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
video_uuid
],
modality
=
"vision_chunk"
)
tests/models/multimodal/processing/test_common.py
View file @
3a92c6f3
...
@@ -411,6 +411,11 @@ def test_processing_correctness(
...
@@ -411,6 +411,11 @@ def test_processing_correctness(
"Qwen-VL tokenizer requires downloading a font file from "
"Qwen-VL tokenizer requires downloading a font file from "
"servers that often refuse connections in CI"
"servers that often refuse connections in CI"
)
)
if
model_id
==
"moonshotai/Kimi-K2.5"
:
# FIXME(Isotr0py): Fix Kimi-K2.5's offline inference with vision chunks.
pytest
.
skip
(
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
)
_test_processing_correctness
(
_test_processing_correctness
(
model_id
,
model_id
,
...
...
tests/models/multimodal/processing/test_tensor_schema.py
View file @
3a92c6f3
...
@@ -155,6 +155,12 @@ def initialize_dummy_model(
...
@@ -155,6 +155,12 @@ def initialize_dummy_model(
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_id"
,
get_model_ids_to_test
())
@
pytest
.
mark
.
parametrize
(
"model_id"
,
get_model_ids_to_test
())
def
test_model_tensor_schema
(
model_id
:
str
):
def
test_model_tensor_schema
(
model_id
:
str
):
if
model_id
==
"moonshotai/Kimi-K2.5"
:
# FIXME(Isotr0py): Fix Kimi-K2.5's offline inference with vision chunks.
pytest
.
skip
(
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
model_info
.
check_transformers_version
(
...
...
tests/models/registry.py
View file @
3a92c6f3
...
@@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"KimiK25ForConditionalGeneration"
:
_HfExamplesInfo
(
"KimiK25ForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-K2.5"
,
"moonshotai/Kimi-K2.5"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
is_available_online
=
False
,
),
),
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"lightonai/LightOnOCR-1B-1025"
"lightonai/LightOnOCR-1B-1025"
...
...
vllm/entrypoints/chat_utils.py
View file @
3a92c6f3
...
@@ -454,78 +454,6 @@ def _get_embeds_data(
...
@@ -454,78 +454,6 @@ def _get_embeds_data(
raise
NotImplementedError
(
type
(
data_items
))
raise
NotImplementedError
(
type
(
data_items
))
def
rebuild_mm_uuids_from_mm_data
(
mm_uuids
:
MultiModalUUIDDict
,
mm_data
:
MultiModalDataDict
,
)
->
MultiModalUUIDDict
:
"""Rebuild mm_uuids after vision_chunk processing.
When videos are split into chunks, the original UUIDs need to be updated
to reflect the new UUIDs generated for each chunk.
Args:
mm_uuids: Original UUIDs dictionary
mm_data: Processed multimodal data with vision_chunk items
Returns:
Updated UUIDs dictionary with chunk UUIDs
"""
vision_chunks
=
mm_data
.
get
(
"vision_chunk"
)
if
vision_chunks
is
None
:
return
mm_uuids
new_uuids
=
dict
(
mm_uuids
)
vision_chunk_uuids
=
[]
for
item
in
vision_chunks
:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert
isinstance
(
item
,
dict
)
uuid_val
=
item
.
get
(
"uuid"
)
if
uuid_val
is
not
None
:
vision_chunk_uuids
.
append
(
uuid_val
)
if
vision_chunk_uuids
:
new_uuids
[
"vision_chunk"
]
=
vision_chunk_uuids
return
new_uuids
def
build_video_prompts_from_mm_data
(
mm_data
:
MultiModalDataDict
,
)
->
list
[
str
]:
"""Build video prompts from vision_chunk data.
Collects prompts from video chunks and groups them by video_idx.
Args:
mm_data: Processed multimodal data with vision_chunk items
Returns:
List of video prompts, one per video.
"""
vision_chunks
=
mm_data
.
get
(
"vision_chunk"
)
if
vision_chunks
is
None
:
return
[]
# Group chunks by video_idx
video_prompts_dict
:
dict
[
int
,
list
[
str
]]
=
defaultdict
(
list
)
for
item
in
vision_chunks
:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert
isinstance
(
item
,
dict
)
if
item
.
get
(
"type"
)
==
"video_chunk"
:
video_idx
=
item
.
get
(
"video_idx"
,
0
)
prompt
=
item
.
get
(
"prompt"
,
""
)
video_prompts_dict
[
video_idx
].
append
(
prompt
)
# Build prompts in video order
video_prompts
=
[]
for
video_idx
in
sorted
(
video_prompts_dict
.
keys
()):
video_prompts
.
append
(
""
.
join
(
video_prompts_dict
[
video_idx
]))
return
video_prompts
class
BaseMultiModalItemTracker
(
ABC
,
Generic
[
_T
]):
class
BaseMultiModalItemTracker
(
ABC
,
Generic
[
_T
]):
"""
"""
Tracks multi-modal items in a given request and ensures that the number
Tracks multi-modal items in a given request and ensures that the number
...
@@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
...
@@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
raise
NotImplementedError
raise
NotImplementedError
def
_resolve_vision_chunk_items
(
vision_chunk_items
:
list
[
tuple
[
object
,
str
|
None
]],
mm_processor
:
BaseMultiModalProcessor
,
vision_chunks_modality_order
:
list
[
str
],
):
# Process vision_chunk items - extract from (data, uuid) tuples
# and convert to VisionChunk types with proper UUID handling
vision_chunks_uuids
=
[
uuid
for
data
,
uuid
in
vision_chunk_items
]
assert
len
(
vision_chunk_items
)
==
len
(
vision_chunks_modality_order
),
(
f
"vision_chunk items (
{
len
(
vision_chunk_items
)
}
) and "
f
"modality_order (
{
len
(
vision_chunks_modality_order
)
}
) must have same length"
)
processed_chunks
:
list
[
VisionChunk
]
=
[]
video_idx
=
0
for
inner_modality
,
(
data
,
uuid
)
in
zip
(
vision_chunks_modality_order
,
vision_chunk_items
):
if
inner_modality
==
"image"
:
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if
hasattr
(
data
,
"media"
):
image_data
=
data
.
media
# type: ignore[union-attr]
processed_chunks
.
append
(
VisionChunkImage
(
type
=
"image"
,
image
=
image_data
,
uuid
=
uuid
)
)
else
:
processed_chunks
.
append
(
data
)
# type: ignore[arg-type]
elif
inner_modality
==
"video"
:
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if
hasattr
(
mm_processor
,
"split_video_chunks"
)
and
data
is
not
None
:
try
:
video_uuid
=
uuid
or
random_uuid
()
# video await result is (video_data, video_meta) tuple
if
isinstance
(
data
,
tuple
)
and
len
(
data
)
>=
1
:
video_data
=
data
[
0
]
else
:
video_data
=
data
video_chunks
=
mm_processor
.
split_video_chunks
(
video_data
)
for
i
,
vc
in
enumerate
(
video_chunks
):
processed_chunks
.
append
(
VisionChunkVideo
(
type
=
"video_chunk"
,
video_chunk
=
vc
[
"video_chunk"
],
uuid
=
f
"
{
video_uuid
}
-
{
i
}
"
,
video_idx
=
video_idx
,
prompt
=
vc
[
"prompt"
],
)
)
video_idx
+=
1
except
Exception
as
e
:
logger
.
warning
(
"Failed to split video chunks: %s"
,
e
)
processed_chunks
.
append
(
data
)
# type: ignore[arg-type]
else
:
processed_chunks
.
append
(
data
)
# type: ignore[arg-type]
return
processed_chunks
,
vision_chunks_uuids
def
_resolve_items
(
def
_resolve_items
(
items_by_modality
:
dict
[
str
,
list
[
tuple
[
object
,
str
|
None
]]],
items_by_modality
:
dict
[
str
,
list
[
tuple
[
object
,
str
|
None
]]],
mm_processor
:
BaseMultiModalProcessor
,
mm_processor
:
BaseMultiModalProcessor
,
vision_chunk_
modality_order
:
dict
[
str
,
list
[
str
]],
modality_order
:
dict
[
str
,
list
[
str
]],
)
->
tuple
[
MultiModalDataDict
,
MultiModalUUIDDict
]:
)
->
tuple
[
MultiModalDataDict
,
MultiModalUUIDDict
]:
if
"image"
in
items_by_modality
and
"image_embeds"
in
items_by_modality
:
if
"image"
in
items_by_modality
and
"image_embeds"
in
items_by_modality
:
raise
ValueError
(
"Mixing raw image and embedding inputs is not allowed"
)
raise
ValueError
(
"Mixing raw image and embedding inputs is not allowed"
)
...
@@ -654,71 +644,13 @@ def _resolve_items(
...
@@ -654,71 +644,13 @@ def _resolve_items(
if
"vision_chunk"
in
items_by_modality
:
if
"vision_chunk"
in
items_by_modality
:
# Process vision_chunk items - extract from (data, modality) tuples
# Process vision_chunk items - extract from (data, modality) tuples
# and convert to VisionChunk types with proper UUID handling
# and convert to VisionChunk types with proper UUID handling
vision_chunk_items
=
items_by_modality
[
"vision_chunk"
]
processed_chunks
,
vision_chunk_uuids
=
_resolve_vision_chunk_items
(
modality_order
=
vision_chunk_modality_order
.
get
(
"vision_chunk"
,
[])
items_by_modality
[
"vision_chunk"
],
mm_uuids
[
"vision_chunk"
]
=
[
mm_processor
,
uuid
for
data
,
uuid
in
items_by_modality
[
"vision_chunk"
]
modality_order
.
get
(
"vision_chunk"
,
[]),
]
# Filter out None items (from asyncio.sleep(0) placeholders)
filtered_items
=
[
(
idx
,
item
)
for
idx
,
item
in
enumerate
(
vision_chunk_items
)
if
item
is
not
None
]
assert
len
(
filtered_items
)
==
len
(
modality_order
),
(
f
"vision_chunk items (
{
len
(
filtered_items
)
}
) and "
f
"modality_order (
{
len
(
modality_order
)
}
) must have same length"
)
)
processed_chunks
:
list
[
VisionChunk
]
=
[]
video_idx
=
0
for
i
,
(
idx
,
item
)
in
enumerate
(
filtered_items
):
inner_modality
=
modality_order
[
i
]
data
,
uuid
=
item
uuid_val
=
uuid
if
idx
<
len
(
mm_uuids
[
"vision_chunk"
])
else
None
if
inner_modality
==
"image"
:
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if
hasattr
(
data
,
"media"
):
image_data
=
data
.
media
# type: ignore[union-attr]
processed_chunks
.
append
(
VisionChunkImage
(
type
=
"image"
,
image
=
image_data
,
uuid
=
uuid_val
)
)
else
:
processed_chunks
.
append
(
data
)
# type: ignore[arg-type]
elif
inner_modality
==
"video"
:
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if
hasattr
(
mm_processor
,
"split_video_chunks"
)
and
data
is
not
None
:
try
:
video_uuid
=
uuid_val
or
random_uuid
()
# video await result is (video_data, video_meta) tuple
if
isinstance
(
data
,
tuple
)
and
len
(
data
)
>=
1
:
video_data
=
data
[
0
]
else
:
video_data
=
data
video_chunks
=
mm_processor
.
split_video_chunks
(
video_data
)
for
i
,
vc
in
enumerate
(
video_chunks
):
processed_chunks
.
append
(
VisionChunkVideo
(
type
=
"video_chunk"
,
video_chunk
=
vc
[
"video_chunk"
],
uuid
=
f
"
{
video_uuid
}
-
{
i
}
"
,
video_idx
=
video_idx
,
prompt
=
vc
[
"prompt"
],
)
)
video_idx
+=
1
except
Exception
as
e
:
logger
.
warning
(
"Failed to split video chunks: %s"
,
e
)
processed_chunks
.
append
(
data
)
# type: ignore[arg-type]
else
:
processed_chunks
.
append
(
data
)
# type: ignore[arg-type]
mm_data
[
"vision_chunk"
]
=
processed_chunks
mm_data
[
"vision_chunk"
]
=
processed_chunks
mm_uuids
[
"vision_chunk"
]
=
vision_chunk_uuids
return
mm_data
,
mm_uuids
return
mm_data
,
mm_uuids
...
...
vllm/multimodal/video.py
View file @
3a92c6f3
...
@@ -235,27 +235,6 @@ class VideoLoader:
...
@@ -235,27 +235,6 @@ class VideoLoader:
VIDEO_LOADER_REGISTRY
=
ExtensionManager
()
VIDEO_LOADER_REGISTRY
=
ExtensionManager
()
@
VIDEO_LOADER_REGISTRY
.
register
(
"identity"
)
class
IdentityVideoLoader
(
VideoLoader
):
"""IdentityVideoLoader returns raw video bytes without decoding.
This allows the model processor to handle video decoding and
is required for models like Kimi-K2.5 that need custom video chunk splitting.
NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
to opencv before release if needed.
"""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
:
Any
,
)
->
tuple
[
Any
,
Any
]:
return
data
,
None
@
VIDEO_LOADER_REGISTRY
.
register
(
"opencv"
)
@
VIDEO_LOADER_REGISTRY
.
register
(
"opencv"
)
class
OpenCVVideoBackend
(
VideoLoader
):
class
OpenCVVideoBackend
(
VideoLoader
):
def
get_cv2_video_api
(
self
):
def
get_cv2_video_api
(
self
):
...
...
vllm/renderers/hf.py
View file @
3a92c6f3
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
inspect
import
inspect
from
collections
import
deque
import
itertools
from
collections
import
defaultdict
,
deque
from
collections.abc
import
Set
from
collections.abc
import
Set
from
functools
import
lru_cache
from
functools
import
lru_cache
from
typing
import
Any
,
cast
from
typing
import
TYPE_CHECKING
,
Any
,
cast
import
jinja2
import
jinja2
import
jinja2.ext
import
jinja2.ext
...
@@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import (
...
@@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption
,
ChatTemplateContentFormatOption
,
ChatTemplateResolutionError
,
ChatTemplateResolutionError
,
ConversationMessage
,
ConversationMessage
,
build_video_prompts_from_mm_data
,
load_chat_template
,
load_chat_template
,
parse_chat_messages
,
parse_chat_messages
,
parse_chat_messages_async
,
parse_chat_messages_async
,
rebuild_mm_uuids_from_mm_data
,
)
)
from
vllm.inputs
import
TextPrompt
,
TokensPrompt
from
vllm.inputs
import
TextPrompt
,
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw
...
@@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw
from
.protocol
import
RendererLike
from
.protocol
import
RendererLike
if
TYPE_CHECKING
:
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalUUIDDict
else
:
MultiModalDataDict
=
dict
[
str
,
Any
]
MultiModalUUIDDict
=
dict
[
str
,
Any
]
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -479,6 +485,104 @@ def safe_apply_chat_template(
...
@@ -479,6 +485,104 @@ def safe_apply_chat_template(
raise
ValueError
(
str
(
e
))
from
e
raise
ValueError
(
str
(
e
))
from
e
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: "MultiModalUUIDDict",
    mm_data: "MultiModalDataDict",
) -> "MultiModalUUIDDict":
    """Refresh the ``vision_chunk`` UUID list from the processed chunks.

    The ``"uuid"`` field of every item under ``mm_data["vision_chunk"]`` is
    collected and stored as the new ``"vision_chunk"`` entry.

    Args:
        mm_uuids: UUID dictionary produced before chunk processing.
        mm_data: Multimodal data whose ``"vision_chunk"`` entry (if any) is a
            list of dict items.

    Returns:
        ``mm_uuids`` itself when there is no ``"vision_chunk"`` entry or no
        chunk carries a non-``None`` uuid; otherwise a shallow copy with
        ``"vision_chunk"`` replaced by the collected UUIDs.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return mm_uuids

    assert all(isinstance(chunk, dict) for chunk in chunks), (
        "Expected all vision_chunk items to be dicts"
    )
    typed_chunks = cast(list[dict[str, Any]], chunks)

    # NOTE(review): chunks whose uuid is None are dropped here, so the result
    # can be shorter than the vision_chunk list - confirm that consumers do
    # not rely on positional alignment between the two.
    chunk_uuids = []
    for chunk in typed_chunks:
        chunk_uuid = chunk.get("uuid")
        if chunk_uuid is not None:
            chunk_uuids.append(chunk_uuid)

    if not chunk_uuids:
        return mm_uuids

    # Copy before writing so the caller's dictionary is left untouched.
    updated = dict(mm_uuids)
    updated["vision_chunk"] = chunk_uuids
    return updated
def build_video_prompts_from_mm_data(
    mm_data: "MultiModalDataDict",
) -> list[str]:
    """Build one prompt string per video from the ``vision_chunk`` items.

    Only items whose ``"type"`` is ``"video_chunk"`` contribute. Their
    ``"prompt"`` strings are grouped by ``"video_idx"`` and concatenated in
    the order the chunks appear.

    Args:
        mm_data: Processed multimodal data with ``vision_chunk`` items.

    Returns:
        The concatenated prompts ordered by ascending ``video_idx``; an empty
        list when ``mm_data`` has no ``"vision_chunk"`` entry.
    """
    vision_chunks = mm_data.get("vision_chunk")
    if vision_chunks is None:
        return []

    # Group chunk prompts by the video they were split from.
    prompts_by_video: dict[int, list[str]] = defaultdict(list)
    for item in vision_chunks:
        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
        assert isinstance(item, dict)
        if item.get("type") == "video_chunk":
            # A missing video_idx falls into video 0; a missing prompt adds "".
            prompts_by_video[item.get("video_idx", 0)].append(item.get("prompt", ""))

    # Iterating a dict yields its keys, so no .keys() call is needed.
    return ["".join(prompts_by_video[video_idx]) for video_idx in sorted(prompts_by_video)]
def replace_vision_chunk_video_placeholder(
    prompt_raw: str | list[int],
    mm_data: "MultiModalDataDict",
    video_placeholder: str | None,
) -> str | list[int]:
    """Swap each video placeholder in a text prompt for its chunk prompt.

    Args:
        prompt_raw: Rendered prompt, either text or a list of token ids.
        mm_data: Processed multimodal data with ``vision_chunk`` items.
        video_placeholder: Marker string to replace; ``None`` or empty
            disables replacement.

    Returns:
        The prompt with the i-th placeholder replaced by the i-th video
        prompt. ``prompt_raw`` is returned unchanged when it is not a string,
        when no placeholder is configured, or when the number of placeholders
        differs from the number of videos (a warning is logged in that case).
    """
    # Only a text prompt with a configured placeholder can be rewritten.
    if not video_placeholder or not isinstance(prompt_raw, str):
        return prompt_raw

    video_prompts = build_video_prompts_from_mm_data(mm_data)

    # Splitting on the placeholder yields one more segment than placeholders.
    segments = prompt_raw.split(video_placeholder)
    placeholder_count = len(segments) - 1
    if placeholder_count != len(video_prompts):
        logger.warning(
            "Number of video placeholders (%d) does not match "
            "number of videos (%d) in the request.",
            placeholder_count,
            len(video_prompts),
        )
        return prompt_raw

    # Interleave segment, prompt, segment, prompt, ... then add the tail.
    interleaved = itertools.chain.from_iterable(zip(segments, video_prompts))
    return "".join(itertools.chain(interleaved, segments[-1:]))
class
HfRenderer
(
RendererLike
):
class
HfRenderer
(
RendererLike
):
@
classmethod
@
classmethod
def
from_config
(
def
from_config
(
...
@@ -496,6 +600,9 @@ class HfRenderer(RendererLike):
...
@@ -496,6 +600,9 @@ class HfRenderer(RendererLike):
super
().
__init__
()
super
().
__init__
()
self
.
config
=
config
self
.
config
=
config
self
.
use_unified_vision_chunk
=
getattr
(
config
.
hf_config
,
"use_unified_vision_chunk"
,
False
)
if
config
.
skip_tokenizer_init
:
if
config
.
skip_tokenizer_init
:
tokenizer
=
None
tokenizer
=
None
...
@@ -552,7 +659,7 @@ class HfRenderer(RendererLike):
...
@@ -552,7 +659,7 @@ class HfRenderer(RendererLike):
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos.
# model which uses unified vision chunks for both images and videos.
if
(
if
(
getattr
(
model_config
.
hf_config
,
"
use_unified_vision_chunk
"
,
False
)
self
.
use_unified_vision_chunk
and
mm_uuids
is
not
None
and
mm_uuids
is
not
None
and
mm_data
is
not
None
and
mm_data
is
not
None
):
):
...
@@ -562,26 +669,11 @@ class HfRenderer(RendererLike):
...
@@ -562,26 +669,11 @@ class HfRenderer(RendererLike):
video_placeholder
=
getattr
(
video_placeholder
=
getattr
(
model_config
.
hf_config
,
"video_placeholder"
,
None
model_config
.
hf_config
,
"video_placeholder"
,
None
)
)
if
video_placeholder
and
isinstance
(
prompt_raw
,
str
):
prompt_raw
=
replace_vision_chunk_video_placeholder
(
video_prompts
=
build_video_prompts_from_mm_data
(
mm_data
)
prompt_raw
,
mm_data
,
# replace in order
video_placeholder
,
prompt_raw_parts
=
prompt_raw
.
split
(
video_placeholder
)
)
if
len
(
prompt_raw_parts
)
==
len
(
video_prompts
)
+
1
:
prompt_raw
=
""
.
join
(
[
prompt_raw_parts
[
i
]
+
video_prompts
[
i
]
for
i
in
range
(
len
(
video_prompts
))
]
)
prompt_raw
+=
prompt_raw_parts
[
-
1
]
else
:
logger
.
warning
(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request."
,
len
(
prompt_raw_parts
)
-
1
,
len
(
video_prompts
),
)
prompt
=
(
prompt
=
(
TextPrompt
(
prompt
=
prompt_raw
)
TextPrompt
(
prompt
=
prompt_raw
)
...
@@ -626,7 +718,7 @@ class HfRenderer(RendererLike):
...
@@ -626,7 +718,7 @@ class HfRenderer(RendererLike):
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos.
# model which uses unified vision chunks for both images and videos.
if
(
if
(
getattr
(
model_config
.
hf_config
,
"
use_unified_vision_chunk
"
,
False
)
self
.
use_unified_vision_chunk
and
mm_uuids
is
not
None
and
mm_uuids
is
not
None
and
mm_data
is
not
None
and
mm_data
is
not
None
):
):
...
@@ -636,26 +728,11 @@ class HfRenderer(RendererLike):
...
@@ -636,26 +728,11 @@ class HfRenderer(RendererLike):
video_placeholder
=
getattr
(
video_placeholder
=
getattr
(
model_config
.
hf_config
,
"video_placeholder"
,
None
model_config
.
hf_config
,
"video_placeholder"
,
None
)
)
if
video_placeholder
and
isinstance
(
prompt_raw
,
str
):
prompt_raw
=
replace_vision_chunk_video_placeholder
(
video_prompts
=
build_video_prompts_from_mm_data
(
mm_data
)
prompt_raw
,
mm_data
,
# replace in order
video_placeholder
,
prompt_raw_parts
=
prompt_raw
.
split
(
video_placeholder
)
)
if
len
(
prompt_raw_parts
)
==
len
(
video_prompts
)
+
1
:
prompt_raw
=
""
.
join
(
[
prompt_raw_parts
[
i
]
+
video_prompts
[
i
]
for
i
in
range
(
len
(
video_prompts
))
]
)
prompt_raw
+=
prompt_raw_parts
[
-
1
]
else
:
logger
.
warning
(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request."
,
len
(
prompt_raw_parts
)
-
1
,
len
(
video_prompts
),
)
prompt
=
(
prompt
=
(
TextPrompt
(
prompt
=
prompt_raw
)
TextPrompt
(
prompt
=
prompt_raw
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment