Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0ab06100
Unverified
Commit
0ab06100
authored
Feb 12, 2026
by
Isotr0py
Committed by
GitHub
Feb 11, 2026
Browse files
[Multimodal] Expose `mm_processor_kwargs` for `DummyInputsBuilder` (#34330)
Signed-off-by:
Isotr0py
<
mozf@mail2.sysu.edu.cn
>
parent
ffb3d553
Changes
72
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
43 additions
and
8 deletions
+43
-8
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral3.py
+1
-0
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mllama4.py
+1
-0
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+1
-0
vllm/model_executor/models/molmo2.py
vllm/model_executor/models/molmo2.py
+1
-0
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nano_nemotron_vl.py
+2
-0
vllm/model_executor/models/nemotron_parse.py
vllm/model_executor/models/nemotron_parse.py
+1
-0
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/nvlm_d.py
+1
-0
vllm/model_executor/models/ovis.py
vllm/model_executor/models/ovis.py
+1
-0
vllm/model_executor/models/ovis2_5.py
vllm/model_executor/models/ovis2_5.py
+1
-0
vllm/model_executor/models/paddleocr_vl.py
vllm/model_executor/models/paddleocr_vl.py
+1
-0
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+1
-0
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+1
-0
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+1
-0
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+2
-0
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
+7
-2
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+4
-1
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+5
-1
vllm/model_executor/models/qwen3_asr.py
vllm/model_executor/models/qwen3_asr.py
+4
-1
vllm/model_executor/models/qwen3_omni_moe_thinker.py
vllm/model_executor/models/qwen3_omni_moe_thinker.py
+1
-1
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl.py
+6
-2
No files found.
vllm/model_executor/models/mistral3.py
View file @
0ab06100
...
@@ -237,6 +237,7 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
...
@@ -237,6 +237,7 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/mllama4.py
View file @
0ab06100
...
@@ -704,6 +704,7 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
...
@@ -704,6 +704,7 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/molmo.py
View file @
0ab06100
...
@@ -1278,6 +1278,7 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
...
@@ -1278,6 +1278,7 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/molmo2.py
View file @
0ab06100
...
@@ -2079,6 +2079,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
...
@@ -2079,6 +2079,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
...
...
vllm/model_executor/models/nano_nemotron_vl.py
View file @
0ab06100
...
@@ -1385,6 +1385,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
...
@@ -1385,6 +1385,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
processor
=
self
.
info
.
get_hf_processor
()
processor
=
self
.
info
.
get_hf_processor
()
...
@@ -1457,6 +1458,7 @@ class NanoNemotronVLDummyInputsBuilder(
...
@@ -1457,6 +1458,7 @@ class NanoNemotronVLDummyInputsBuilder(
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
dummy_image
=
super
().
get_dummy_mm_data
(
dummy_image
=
super
().
get_dummy_mm_data
(
seq_len
=
seq_len
,
mm_counts
=
mm_counts
,
mm_options
=
mm_options
seq_len
=
seq_len
,
mm_counts
=
mm_counts
,
mm_options
=
mm_options
...
...
vllm/model_executor/models/nemotron_parse.py
View file @
0ab06100
...
@@ -642,6 +642,7 @@ class NemotronParseDummyInputsBuilder(
...
@@ -642,6 +642,7 @@ class NemotronParseDummyInputsBuilder(
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/nvlm_d.py
View file @
0ab06100
...
@@ -93,6 +93,7 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
...
@@ -93,6 +93,7 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/ovis.py
View file @
0ab06100
...
@@ -303,6 +303,7 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
...
@@ -303,6 +303,7 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/ovis2_5.py
View file @
0ab06100
...
@@ -302,6 +302,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
...
@@ -302,6 +302,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
...
...
vllm/model_executor/models/paddleocr_vl.py
View file @
0ab06100
...
@@ -204,6 +204,7 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing
...
@@ -204,6 +204,7 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/paligemma.py
View file @
0ab06100
...
@@ -128,6 +128,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
...
@@ -128,6 +128,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
hf_config
=
self
.
info
.
get_hf_config
()
hf_config
=
self
.
info
.
get_hf_config
()
vision_config
=
hf_config
.
vision_config
vision_config
=
hf_config
.
vision_config
...
...
vllm/model_executor/models/phi3v.py
View file @
0ab06100
...
@@ -380,6 +380,7 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
...
@@ -380,6 +380,7 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/phi4mm.py
View file @
0ab06100
...
@@ -826,6 +826,7 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
...
@@ -826,6 +826,7 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
vllm/model_executor/models/pixtral.py
View file @
0ab06100
...
@@ -261,6 +261,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
...
@@ -261,6 +261,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
@@ -282,6 +283,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
...
@@ -282,6 +283,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
ProcessorInputs
:
)
->
ProcessorInputs
:
tokenizer
=
self
.
info
.
get_tokenizer
()
tokenizer
=
self
.
info
.
get_tokenizer
()
...
...
vllm/model_executor/models/qwen2_5_omni_thinker.py
View file @
0ab06100
...
@@ -358,12 +358,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
...
@@ -358,12 +358,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
mm_processor_kwargs
=
mm_processor_kwargs
or
{}
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
mm_processor_kwargs
)
target_audio_length
=
(
target_audio_length
=
(
min
(
min
(
...
@@ -372,7 +374,10 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
...
@@ -372,7 +374,10 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
)
)
*
feature_extractor
.
sampling_rate
*
feature_extractor
.
sampling_rate
)
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
mm_processor_kwargs
.
get
(
"max_pixels"
,
None
),
)
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
seq_len
,
mm_counts
seq_len
,
mm_counts
)
)
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
0ab06100
...
@@ -195,8 +195,11 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn
...
@@ -195,8 +195,11 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
feature_extractor
=
self
.
info
.
get_feature_extractor
()
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
(
mm_processor_kwargs
or
{})
)
sampling_rate
=
feature_extractor
.
sampling_rate
sampling_rate
=
feature_extractor
.
sampling_rate
audio_len
=
feature_extractor
.
chunk_length
*
sampling_rate
audio_len
=
feature_extractor
.
chunk_length
*
sampling_rate
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
0ab06100
...
@@ -1016,11 +1016,15 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
...
@@ -1016,11 +1016,15 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
()
mm_processor_kwargs
=
mm_processor_kwargs
or
{}
target_width
,
target_height
=
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
mm_processor_kwargs
.
get
(
"max_pixels"
,
None
)
)
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
seq_len
,
mm_counts
seq_len
,
mm_counts
)
)
...
...
vllm/model_executor/models/qwen3_asr.py
View file @
0ab06100
...
@@ -147,10 +147,13 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
...
@@ -147,10 +147,13 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
num_audios
=
mm_counts
.
get
(
"audio"
,
0
)
feature_extractor
=
self
.
info
.
get_feature_extractor
()
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
(
mm_processor_kwargs
or
{})
)
target_audio_length
=
(
target_audio_length
=
(
min
(
min
(
...
...
vllm/model_executor/models/qwen3_omni_moe_thinker.py
View file @
0ab06100
...
@@ -1169,7 +1169,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
...
@@ -1169,7 +1169,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
return
x
return
x
# NOTE: WhisperFeatureExtractor cannot handle empty list of audios
# NOTE: WhisperFeatureExtractor cannot handle empty list of audios
feature_extractor
=
self
.
info
.
get_feature_extractor
()
feature_extractor
=
self
.
info
.
get_feature_extractor
(
**
mm_kwargs
)
hop_length
=
feature_extractor
.
hop_length
hop_length
=
feature_extractor
.
hop_length
if
audios
:
if
audios
:
# NOTE: Qwen3-Omni processor accept "audio"
# NOTE: Qwen3-Omni processor accept "audio"
...
...
vllm/model_executor/models/qwen3_vl.py
View file @
0ab06100
...
@@ -796,14 +796,18 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
...
@@ -796,14 +796,18 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
seq_len
:
int
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
mm_counts
:
Mapping
[
str
,
int
],
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_options
:
Mapping
[
str
,
BaseDummyOptions
]
|
None
=
None
,
mm_processor_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
)
->
MultiModalDataDict
:
)
->
MultiModalDataDict
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_images
=
mm_counts
.
get
(
"image"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
num_videos
=
mm_counts
.
get
(
"video"
,
0
)
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
image_overrides
=
mm_options
.
get
(
"image"
)
if
mm_options
else
None
video_overrides
=
mm_options
.
get
(
"video"
)
if
mm_options
else
None
video_overrides
=
mm_options
.
get
(
"video"
)
if
mm_options
else
None
mm_processor_kwargs
=
mm_processor_kwargs
or
{}
target_image_width
,
target_image_height
=
(
target_image_width
,
target_image_height
=
(
self
.
info
.
get_image_size_with_most_features
()
self
.
info
.
get_image_size_with_most_features
(
max_pixels
=
mm_processor_kwargs
.
get
(
"max_pixels"
,
None
),
)
)
)
# treat videos as special images
# treat videos as special images
...
@@ -828,7 +832,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
...
@@ -828,7 +832,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
target_num_frames
=
min
(
target_num_frames
,
num_frames_override
)
target_num_frames
=
min
(
target_num_frames
,
num_frames_override
)
target_num_frames
=
max
(
target_num_frames
,
2
)
target_num_frames
=
max
(
target_num_frames
,
2
)
video_processor
=
self
.
info
.
get_video_processor
()
video_processor
=
self
.
info
.
get_video_processor
(
**
(
mm_processor_kwargs
or
{})
)
video_max_pixels
=
video_processor
.
size
[
"longest_edge"
]
video_max_pixels
=
video_processor
.
size
[
"longest_edge"
]
# video_max_pixels contains the temporal compression factor,
# video_max_pixels contains the temporal compression factor,
# so we divide by 2 to get the maximum number of image pixels.
# so we divide by 2 to get the maximum number of image pixels.
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment