Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8b464d96
Unverified
Commit
8b464d96
authored
Apr 28, 2025
by
Cyrus Leung
Committed by
GitHub
Apr 28, 2025
Browse files
[Misc] Clean up Qwen2.5-Omni code (#17301)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
889ebb26
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
75 additions
and
94 deletions
+75
-94
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
+8
-51
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+67
-43
No files found.
vllm/model_executor/models/qwen2_5_omni_thinker.py
View file @
8b464d96
...
...
@@ -51,11 +51,9 @@ from vllm.model_executor.models.qwen2_audio import (
from
vllm.model_executor.models.qwen2_vl
import
Qwen2VLMultiModalDataParser
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.hasher
import
MultiModalHasher
from
vllm.multimodal.inputs
import
(
ImageItem
,
ModalityData
,
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.parse
import
(
AudioProcessorItems
,
DictEmbeddingItems
,
ModalityDataItems
,
MultiModalDataItems
,
MultiModalDataParser
)
...
...
@@ -279,46 +277,17 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
return
_qwen2_5_omni_thinker_field_config
(
hf_inputs
)
def
apply
(
def
_maybe_apply_prompt_updates
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
mm_items
:
MultiModalDataItems
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
return_mm_hashes
:
bool
=
False
,
)
->
MultiModalInputs
:
prompt_ids
:
list
[
int
],
mm_kwargs
:
MultiModalKwargs
,
is_update_applied
:
bool
,
)
->
tuple
[
list
[
int
],
str
,
Mapping
[
str
,
list
[
PlaceholderFeaturesInfo
]]]:
"""
Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`.
"""
mm_items
=
self
.
_to_mm_items
(
mm_data
)
# Create MM hashes to be returned (only used in V1)
# TODO: Use these hash keys for caching operations in apply_hf_processor
# instead of rehashing.
if
return_mm_hashes
:
model_id
=
self
.
info
.
model_id
mm_hashes
=
{
modality
:
[
MultiModalHasher
.
hash_kwargs
(
model_id
=
model_id
,
**
{
modality
:
item
},
**
hf_processor_mm_kwargs
)
for
item
in
items
]
for
modality
,
items
in
mm_items
.
items
()
}
else
:
mm_hashes
=
None
(
prompt_ids
,
mm_kwargs
,
is_update_applied
,
)
=
self
.
_cached_apply_hf_processor
(
prompt
,
mm_items
,
hf_processor_mm_kwargs
,
)
unbound_prompt_updates
=
self
.
_get_prompt_updates
(
mm_items
,
hf_processor_mm_kwargs
,
...
...
@@ -364,22 +333,10 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
tokenizer
=
self
.
info
.
get_tokenizer
()
prompt
=
decode_tokens
(
tokenizer
,
prompt_ids
)
mm_placeholder_ranges
=
{
modality
:
[
item
.
to_range
()
for
item
in
placeholders
]
for
modality
,
placeholders
in
mm_placeholders
.
items
()
}
if
use_audio_in_video
:
mm_kwargs
[
"use_audio_in_video"
]
=
True
return
MultiModalInputs
(
type
=
"multimodal"
,
prompt
=
prompt
,
prompt_token_ids
=
prompt_ids
,
mm_kwargs
=
mm_kwargs
,
mm_hashes
=
mm_hashes
,
mm_placeholders
=
mm_placeholder_ranges
,
)
return
prompt_ids
,
prompt
,
mm_placeholders
def
_get_prompt_updates
(
self
,
...
...
vllm/multimodal/processing.py
View file @
8b464d96
...
...
@@ -1569,35 +1569,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
"model (usually arising from an inconsistency between "
"`_call_hf_processor` and `_get_prompt_updates`)."
)
def
apply
(
def
_hash_mm_items
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
mm_items
:
MultiModalDataItems
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
return_mm_hashes
:
bool
=
False
,
)
->
MultiModalInputs
:
"""
Process multi-modal inputs to be used in vLLM.
The main steps are:
1. Apply HF Processor on prompt text and multi-modal data together,
outputting token IDs and processed tensors.
2. Find and update sequences in the token IDs with placeholder tokens.
The number of placeholder tokens equals the feature size of the
multi-modal data outputted by the multi-modal encoder.
3. Extract information about the placeholder tokens from the
processed token IDs.
"""
mm_items
=
self
.
_to_mm_items
(
mm_data
)
)
->
dict
[
str
,
list
[
str
]]:
"""Create MM hashes to be returned (only used in V1)."""
# Create MM hashes to be returned (only used in V1)
# TODO: Use these hash keys for caching operations in apply_hf_processor
# instead of rehashing.
if
return_mm_hashes
:
model_id
=
self
.
info
.
model_id
mm_hashes
=
{
return
{
modality
:
[
MultiModalHasher
.
hash_kwargs
(
model_id
=
model_id
,
**
{
modality
:
item
},
...
...
@@ -1606,19 +1589,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
]
for
modality
,
items
in
mm_items
.
items
()
}
else
:
mm_hashes
=
None
(
prompt_ids
,
mm_kwargs
,
is_update_applied
,
)
=
self
.
_cached_apply_hf_processor
(
prompt
,
mm_items
,
hf_processor_mm_kwargs
,
)
def
_maybe_apply_prompt_updates
(
self
,
mm_items
:
MultiModalDataItems
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
prompt_ids
:
list
[
int
],
mm_kwargs
:
MultiModalKwargs
,
is_update_applied
:
bool
,
)
->
tuple
[
list
[
int
],
str
,
Mapping
[
str
,
list
[
PlaceholderFeaturesInfo
]]]:
unbound_prompt_updates
=
self
.
_get_prompt_updates
(
mm_items
,
hf_processor_mm_kwargs
,
...
...
@@ -1652,6 +1631,51 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
)
self
.
_validate_mm_placeholders
(
mm_placeholders
,
mm_item_counts
)
return
prompt_ids
,
prompt
,
mm_placeholders
def
apply
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
return_mm_hashes
:
bool
=
False
,
)
->
MultiModalInputs
:
"""
Process multi-modal inputs to be used in vLLM.
The main steps are:
1. Apply HF Processor on prompt text and multi-modal data together,
outputting token IDs and processed tensors.
2. Find and update sequences in the token IDs with placeholder tokens.
The number of placeholder tokens equals the feature size of the
multi-modal data outputted by the multi-modal encoder.
3. Extract information about the placeholder tokens from the
processed token IDs.
"""
mm_items
=
self
.
_to_mm_items
(
mm_data
)
mm_hashes
=
(
self
.
_hash_mm_items
(
mm_items
,
hf_processor_mm_kwargs
)
if
return_mm_hashes
else
None
)
(
prompt_ids
,
mm_kwargs
,
is_update_applied
,
)
=
self
.
_cached_apply_hf_processor
(
prompt
,
mm_items
,
hf_processor_mm_kwargs
,
)
prompt_ids
,
prompt
,
mm_placeholders
=
self
.
_maybe_apply_prompt_updates
(
mm_items
=
mm_items
,
hf_processor_mm_kwargs
=
hf_processor_mm_kwargs
,
prompt_ids
=
prompt_ids
,
mm_kwargs
=
mm_kwargs
,
is_update_applied
=
is_update_applied
,
)
mm_placeholder_ranges
=
{
modality
:
[
item
.
to_range
()
for
item
in
placeholders
]
for
modality
,
placeholders
in
mm_placeholders
.
items
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment