Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
987506bc
Unverified
Commit
987506bc
authored
Feb 23, 2026
by
Cyrus Leung
Committed by
GitHub
Feb 22, 2026
Browse files
[Refactor] Simplify dummy data generation (#35025)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c645e9a2
Changes
78
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
88 additions
and
129 deletions
+88
-129
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
+6
-10
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+6
-7
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+10
-9
vllm/model_executor/models/qwen3_asr.py
vllm/model_executor/models/qwen3_asr.py
+3
-6
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl.py
+23
-13
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+2
-3
vllm/model_executor/models/rvl.py
vllm/model_executor/models/rvl.py
+2
-3
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+2
-3
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+2
-3
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+2
-3
vllm/model_executor/models/terratorch.py
vllm/model_executor/models/terratorch.py
+1
-2
vllm/model_executor/models/transformers/multimodal.py
vllm/model_executor/models/transformers/multimodal.py
+2
-3
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+6
-7
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral.py
+6
-6
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+6
-7
vllm/multimodal/processing/context.py
vllm/multimodal/processing/context.py
+5
-7
vllm/multimodal/processing/dummy_inputs.py
vllm/multimodal/processing/dummy_inputs.py
+3
-13
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+1
-24
No files found.
vllm/model_executor/models/qwen2_5_omni_thinker.py
View file @
987506bc
...
...
@@ -357,15 +357,13 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
mm_processor_kwargs
=
mm_processor_kwargs
or
{}
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
mm_processor_kwargs
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
target_audio_length
=
(
min
(
...
...
@@ -375,16 +373,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
*
feature_extractor
.
sampling_rate
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
mm_processor_kwargs
.
get
(
"max_pixels"
,
None
),
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
seq_len
,
mm_counts
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
video_overrides
=
mm_options
.
get
(
"video"
)
if
mm_options
else
None
audio_overrides
=
mm_options
.
get
(
"audio"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
video_overrides
=
mm_options
.
get
(
"video"
)
audio_overrides
=
mm_options
.
get
(
"audio"
)
mm_data
=
{
"audio"
:
self
.
_get_dummy_audios
(
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
987506bc
...
...
@@ -195,22 +195,21 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
(
mm_processor_kwargs
or
{})
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
sampling_rate
=
feature_extractor
.
sampling_rate
audio_len
=
feature_extractor
.
chunk_length
*
sampling_rate
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
audio_overrides
=
mm_options
.
get
(
"audio"
)
if
mm_options
else
None
audio_overrides
=
mm_options
.
get
(
"audio"
)
return
{
"audio"
:
self
.
_get_dummy_audios
(
length
=
audio_len
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
length
=
audio_len
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
,
)
}
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
987506bc
...
...
@@ -925,9 +925,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
vision_config
=
hf_config
.
vision_config
patch_size
=
vision_config
.
patch_size
merge_size
=
vision_config
.
spatial_merge_size
if
max_pixels
is
None
:
image_processor
=
self
.
get_image_processor
()
max_pixels
=
image_processor
.
size
[
"longest_edge"
]
mm_kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
({})
size
=
mm_kwargs
.
get
(
"size"
,
image_processor
.
size
)
max_pixels
=
size
[
"longest_edge"
]
unit
=
patch_size
*
merge_size
max_seq_len
=
max_pixels
//
(
unit
*
unit
)
...
...
@@ -1027,22 +1032,18 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
mm_processor_kwargs
=
mm_processor_kwargs
or
{}
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
mm_processor_kwargs
.
get
(
"max_pixels"
,
None
)
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
seq_len
,
mm_counts
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
video_overrides
=
mm_options
.
get
(
"video"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
video_overrides
=
mm_options
.
get
(
"video"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/qwen3_asr.py
View file @
987506bc
...
...
@@ -146,14 +146,11 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
(
mm_processor_kwargs
or
{})
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
target_audio_length
=
(
min
(
...
...
@@ -163,7 +160,7 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
*
feature_extractor
.
sampling_rate
)
audio_overrides
=
mm_options
.
get
(
"audio"
)
if
mm_options
else
None
audio_overrides
=
mm_options
.
get
(
"audio"
)
return
{
"audio"
:
self
.
_get_dummy_audios
(
...
...
vllm/model_executor/models/qwen3_vl.py
View file @
987506bc
...
...
@@ -703,11 +703,18 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
mm_counts
:
Mapping
[
str
,
int
],
)
->
int
:
video_processor
=
self
.
get_video_processor
()
video_max_pixels
=
video_processor
.
size
[
"longest_edge"
]
mm_kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
({})
video_size
=
mm_kwargs
.
get
(
"size"
,
video_processor
.
size
)
temporal_patch_size
=
mm_kwargs
.
get
(
"temporal_patch_size"
,
video_processor
.
temporal_patch_size
)
# video_max_pixels contains the temporal compression factor,
# so we divide by 2 to get the maximum number of image pixels.
video_max_pixels
=
video_size
[
"longest_edge"
]
target_width
,
target_height
=
self
.
get_image_size_with_most_features
(
max_pixels
=
video_max_pixels
//
video_processor
.
temporal_patch_size
max_pixels
=
video_max_pixels
//
temporal_patch_size
)
num_video_soft_tokens
=
self
.
get_num_video_tokens
(
image_width
=
target_width
,
...
...
@@ -789,19 +796,15 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
video_overrides
=
mm_options
.
get
(
"video"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
video_overrides
=
mm_options
.
get
(
"video"
)
mm_processor_kwargs
=
mm_processor_kwargs
or
{}
target_image_width
,
target_image_height
=
(
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
mm_processor_kwargs
.
get
(
"max_pixels"
,
None
),
)
self
.
info
.
get_image_size_with_most_features
()
)
# treat videos as special images
...
...
@@ -826,13 +829,20 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
target_num_frames
=
min
(
target_num_frames
,
num_frames_override
)
target_num_frames
=
max
(
target_num_frames
,
2
)
video_processor
=
self
.
info
.
get_video_processor
(
**
(
mm_processor_kwargs
or
{}))
video_max_pixels
=
video_processor
.
size
[
"longest_edge"
]
video_processor
=
self
.
info
.
get_video_processor
()
mm_kwargs
=
self
.
info
.
ctx
.
get_merged_mm_kwargs
({})
video_size
=
mm_kwargs
.
get
(
"size"
,
video_processor
.
size
)
temporal_patch_size
=
mm_kwargs
.
get
(
"temporal_patch_size"
,
video_processor
.
temporal_patch_size
)
# video_max_pixels contains the temporal compression factor,
# so we divide by 2 to get the maximum number of image pixels.
video_max_pixels
=
video_size
[
"longest_edge"
]
target_video_width
,
target_video_height
=
(
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
video_max_pixels
//
video_processor
.
temporal_patch_size
max_pixels
=
video_max_pixels
//
temporal_patch_size
)
)
target_video_size
,
_
=
self
.
info
.
_get_vision_info
(
...
...
vllm/model_executor/models/qwen_vl.py
View file @
987506bc
...
...
@@ -617,8 +617,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
hf_config
=
self
.
info
.
get_hf_config
()
vision_config
=
hf_config
.
visual
...
...
@@ -626,7 +625,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
target_width
=
target_height
=
vision_config
[
"image_size"
]
num_images
=
mm_counts
.
get
(
"image"
,
0
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/rvl.py
View file @
987506bc
...
...
@@ -40,14 +40,13 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/siglip.py
View file @
987506bc
...
...
@@ -158,14 +158,13 @@ class SiglipDummyInputsBuilder(BaseDummyInputsBuilder[SiglipProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
987506bc
...
...
@@ -529,13 +529,12 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
num_images
=
mm_counts
.
get
(
"image"
,
0
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/step3_vl.py
View file @
987506bc
...
...
@@ -564,13 +564,12 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
num_images
=
mm_counts
.
get
(
"image"
,
0
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/terratorch.py
View file @
987506bc
...
...
@@ -154,8 +154,7 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
# Dummy data is generated based on the 'input' section
# defined in the HF configuration file
...
...
vllm/model_executor/models/transformers/multimodal.py
View file @
987506bc
...
...
@@ -101,14 +101,13 @@ class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingIn
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
"BaseDummyOptions"
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
"BaseDummyOptions"
],
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
target_width
,
target_height
=
self
.
info
.
get_max_image_size
()
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
return
{
"image"
:
self
.
_get_dummy_images
(
...
...
vllm/model_executor/models/ultravox.py
View file @
987506bc
...
...
@@ -164,12 +164,9 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
(
mm_processor_kwargs
or
{})
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
sampling_rate
=
feature_extractor
.
sampling_rate
audio_len
=
(
...
...
@@ -177,11 +174,13 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
)
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
audio_overrides
=
mm_options
.
get
(
"audio"
)
if
mm_options
else
None
audio_overrides
=
mm_options
.
get
(
"audio"
)
return
{
"audio"
:
self
.
_get_dummy_audios
(
length
=
audio_len
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
length
=
audio_len
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
,
)
}
...
...
vllm/model_executor/models/voxtral.py
View file @
987506bc
...
...
@@ -218,18 +218,19 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
target_length
=
self
.
info
.
get_max_audio_array_len
()
audio_overrides
=
mm_options
.
get
(
"audio"
)
if
mm_options
else
None
audio_overrides
=
mm_options
.
get
(
"audio"
)
return
{
"audio"
:
self
.
_get_dummy_audios
(
length
=
target_length
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
length
=
target_length
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
,
)
}
...
...
@@ -237,8 +238,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
ProcessorInputs
:
tokenizer
=
self
.
info
.
get_tokenizer
()
...
...
vllm/model_executor/models/whisper.py
View file @
987506bc
...
...
@@ -695,22 +695,21 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
(
mm_processor_kwargs
or
{})
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
sampling_rate
=
feature_extractor
.
sampling_rate
audio_len
=
feature_extractor
.
chunk_length
*
sampling_rate
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
audio_overrides
=
mm_options
.
get
(
"audio"
)
if
mm_options
else
None
audio_overrides
=
mm_options
.
get
(
"audio"
)
return
{
"audio"
:
self
.
_get_dummy_audios
(
length
=
audio_len
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
length
=
audio_len
,
num_audios
=
num_audios
,
overrides
=
audio_overrides
,
)
}
...
...
vllm/multimodal/processing/context.py
View file @
987506bc
...
...
@@ -266,11 +266,14 @@ class InputProcessingContext:
if
isinstance
(
tokenizer
,
MistralTokenizer
):
tokenizer
=
tokenizer
.
transformers_tokenizer
merged_kwargs
=
self
.
get_merged_mm_kwargs
(
kwargs
)
merged_kwargs
.
pop
(
"tokenizer"
,
None
)
return
cached_processor_from_config
(
self
.
model_config
,
processor_cls
=
typ
,
tokenizer
=
tokenizer
,
**
kwargs
,
**
merged_
kwargs
,
)
def
init_processor
(
...
...
@@ -283,12 +286,7 @@ class InputProcessingContext:
Initialize a HuggingFace-like processor class, merging the
keyword arguments with those in the model's configuration.
"""
mm_config
=
self
.
model_config
.
get_multimodal_config
()
base_kwargs
=
mm_config
.
mm_processor_kwargs
if
base_kwargs
is
None
:
base_kwargs
=
{}
merged_kwargs
=
{
**
base_kwargs
,
**
kwargs
}
merged_kwargs
=
self
.
get_merged_mm_kwargs
(
kwargs
)
return
typ
(
**
merged_kwargs
)
...
...
vllm/multimodal/processing/dummy_inputs.py
View file @
987506bc
...
...
@@ -62,8 +62,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
"""
Build the multimodal input which, after processing, results in
...
...
@@ -83,8 +82,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
ProcessorInputs
:
"""
Build the input which, after processing, results in
...
...
@@ -94,16 +92,9 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
seq_len: Sequence length
mm_counts: Count of items per modality
mm_options: Configurable options per modality (optional)
mm_processor_kwargs: Additional keyword arguments
for hf_processor (optional)
"""
dummy_text
=
self
.
get_dummy_text
(
mm_counts
)
dummy_mm_data
=
self
.
get_dummy_mm_data
(
seq_len
,
mm_counts
,
mm_options
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
dummy_mm_data
=
self
.
get_dummy_mm_data
(
seq_len
,
mm_counts
,
mm_options
)
dummy_mm_items
=
self
.
info
.
parse_mm_data
(
dummy_mm_data
,
validate
=
False
)
tokenization_kwargs
=
{
"truncation"
:
False
}
...
...
@@ -111,7 +102,6 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
return
ProcessorInputs
(
prompt
=
dummy_text
,
mm_items
=
dummy_mm_items
,
hf_processor_mm_kwargs
=
mm_processor_kwargs
or
{},
tokenization_kwargs
=
tokenization_kwargs
,
)
...
...
vllm/multimodal/registry.py
View file @
987506bc
...
...
@@ -5,7 +5,6 @@ from dataclasses import dataclass
from
multiprocessing.synchronize
import
Lock
as
LockType
from
typing
import
TYPE_CHECKING
,
Generic
,
Literal
,
Protocol
,
TypeVar
,
cast
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.observability
import
ObservabilityConfig
from
vllm.logger
import
init_logger
from
vllm.tokenizers
import
TokenizerLike
,
cached_tokenizer_from_config
...
...
@@ -99,27 +98,6 @@ class MultiModalRegistry:
A registry that dispatches data processing according to the model.
"""
def
_extract_mm_options
(
self
,
model_config
:
"ModelConfig"
,
)
->
Mapping
[
str
,
BaseDummyOptions
]
|
None
:
"""
Extract multimodal dummy options from model config.
Returns None if no configurable options are found, otherwise returns
a mapping of modality names to their dummy options.
"""
if
not
model_config
.
multimodal_config
:
return
None
mm_options
=
{
m
:
opt
for
m
in
model_config
.
multimodal_config
.
limit_per_prompt
if
(
opt
:
=
model_config
.
multimodal_config
.
get_dummy_options
(
m
))
is
not
None
}
return
mm_options
if
len
(
mm_options
)
>
0
else
None
def
supports_multimodal_inputs
(
self
,
model_config
:
"ModelConfig"
)
->
bool
:
"""
Checks if the model supports multimodal inputs.
...
...
@@ -261,8 +239,7 @@ class MultiModalRegistry:
processor_inputs
=
processor
.
dummy_inputs
.
get_dummy_processor_inputs
(
seq_len
=
seq_len
,
mm_counts
=
mm_counts
,
mm_options
=
self
.
_extract_mm_options
(
model_config
),
mm_processor_kwargs
=
mm_config
.
mm_processor_kwargs
,
mm_options
=
mm_config
.
limit_per_prompt
,
)
mm_inputs
=
processor
.
apply
(
prompt
=
processor_inputs
.
prompt
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment