Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
88c3e114
Unverified
Commit
88c3e114
authored
Feb 01, 2026
by
Cyrus Leung
Committed by
GitHub
Jan 31, 2026
Browse files
[Refactor] Move MM data parsing outside processor (#33408)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
92924b2d
Changes
43
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
94 additions
and
54 deletions
+94
-54
vllm/model_executor/models/aya_vision.py
vllm/model_executor/models/aya_vision.py
+2
-3
vllm/model_executor/models/clip.py
vllm/model_executor/models/clip.py
+4
-4
vllm/model_executor/models/cohere2_vision.py
vllm/model_executor/models/cohere2_vision.py
+2
-3
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+2
-3
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/idefics3.py
+2
-3
vllm/model_executor/models/lfm2_vl.py
vllm/model_executor/models/lfm2_vl.py
+2
-3
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+2
-3
vllm/model_executor/models/minicpmo.py
vllm/model_executor/models/minicpmo.py
+2
-1
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+4
-2
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mllama4.py
+2
-3
vllm/model_executor/models/nemotron_parse.py
vllm/model_executor/models/nemotron_parse.py
+1
-1
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+2
-2
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+3
-1
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+4
-4
vllm/model_executor/models/terratorch.py
vllm/model_executor/models/terratorch.py
+6
-6
vllm/model_executor/models/transformers/multimodal.py
vllm/model_executor/models/transformers/multimodal.py
+1
-2
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral.py
+7
-4
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+1
-1
vllm/multimodal/processing/context.py
vllm/multimodal/processing/context.py
+41
-1
vllm/multimodal/processing/dummy_inputs.py
vllm/multimodal/processing/dummy_inputs.py
+4
-4
No files found.
vllm/model_executor/models/aya_vision.py
View file @
88c3e114
...
@@ -227,9 +227,8 @@ class AyaVisionMultiModalProcessor(BaseMultiModalProcessor[AyaVisionProcessingIn
...
@@ -227,9 +227,8 @@ class AyaVisionMultiModalProcessor(BaseMultiModalProcessor[AyaVisionProcessingIn
# HF processor pops the `num_patches` kwarg, which is needed by vLLM
# HF processor pops the `num_patches` kwarg, which is needed by vLLM
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
"image"
,
ImageProcessorItems
parsed_images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
)
image_sizes
=
[
image_sizes
=
[
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
]
]
...
...
vllm/model_executor/models/clip.py
View file @
88c3e114
...
@@ -201,20 +201,20 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
...
@@ -201,20 +201,20 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
def
apply
(
def
apply
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
*
,
*
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
)
->
MultiModalInputs
:
)
->
MultiModalInputs
:
if
prompt
and
mm_
data
:
if
prompt
and
mm_
items
:
raise
ValueError
(
raise
ValueError
(
"CLIP accepts text-only or image-only inputs, not both! "
"CLIP accepts text-only or image-only inputs, not both! "
"Image-only inputs means passing an image with an empty text "
"Image-only inputs means passing an image with an empty text "
"prompt."
"prompt."
)
)
if
mm_
data
:
if
mm_
items
:
# For multi-modal data, the prompt after processing should
# For multi-modal data, the prompt after processing should
# only contain the dummy image tokens
# only contain the dummy image tokens
tokenization_kwargs
=
{
tokenization_kwargs
=
{
...
@@ -224,7 +224,7 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
...
@@ -224,7 +224,7 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
return
super
().
apply
(
return
super
().
apply
(
prompt
=
prompt
,
prompt
=
prompt
,
mm_
data
=
mm_data
,
mm_
items
=
mm_items
,
hf_processor_mm_kwargs
=
hf_processor_mm_kwargs
,
hf_processor_mm_kwargs
=
hf_processor_mm_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
mm_uuids
=
mm_uuids
,
...
...
vllm/model_executor/models/cohere2_vision.py
View file @
88c3e114
...
@@ -262,9 +262,8 @@ class Cohere2VisionMultiModalProcessor(
...
@@ -262,9 +262,8 @@ class Cohere2VisionMultiModalProcessor(
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
# Fallback calculation if HF processor didn't provide num_patches
# Fallback calculation if HF processor didn't provide num_patches
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
"image"
,
ImageProcessorItems
parsed_images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
)
num_patches
=
[
num_patches
=
[
self
.
info
.
get_num_patches
(
self
.
info
.
get_num_patches
(
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
88c3e114
...
@@ -290,9 +290,8 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
...
@@ -290,9 +290,8 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
# HF processor pops the `num_crops` kwarg, which is needed by vLLM
# HF processor pops the `num_crops` kwarg, which is needed by vLLM
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
"image"
,
ImageProcessorItems
parsed_images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
)
image_sizes
=
[
image_sizes
=
[
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
]
]
...
...
vllm/model_executor/models/idefics3.py
View file @
88c3e114
...
@@ -349,9 +349,8 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo
...
@@ -349,9 +349,8 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo
tok_kwargs
,
tok_kwargs
,
)
)
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
"image"
,
ImageProcessorItems
parsed_images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
)
image_sizes
=
[
image_sizes
=
[
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
]
]
...
...
vllm/model_executor/models/lfm2_vl.py
View file @
88c3e114
...
@@ -357,9 +357,8 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
...
@@ -357,9 +357,8 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
tok_kwargs
,
tok_kwargs
,
)
)
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
"image"
,
ImageProcessorItems
parsed_images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
)
image_sizes
=
[
image_sizes
=
[
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
parsed_images
.
get_image_size
(
i
)
for
i
in
range
(
len
(
parsed_images
))
]
]
...
...
vllm/model_executor/models/llava.py
View file @
88c3e114
...
@@ -769,7 +769,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
...
@@ -769,7 +769,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
def
apply
(
def
apply
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
...
@@ -785,13 +785,12 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
...
@@ -785,13 +785,12 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
result
=
super
().
apply
(
result
=
super
().
apply
(
prompt
,
prompt
,
mm_
data
,
mm_
items
,
hf_processor_mm_kwargs
,
hf_processor_mm_kwargs
,
tokenization_kwargs
,
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
mm_uuids
=
mm_uuids
,
)
)
mm_items
=
self
.
_to_mm_items
(
mm_data
)
mm_item_counts
=
mm_items
.
get_all_counts
()
mm_item_counts
=
mm_items
.
get_all_counts
()
mm_kwargs
=
result
[
"mm_kwargs"
]
mm_kwargs
=
result
[
"mm_kwargs"
]
mm_hashes
=
result
[
"mm_hashes"
]
mm_hashes
=
result
[
"mm_hashes"
]
...
...
vllm/model_executor/models/minicpmo.py
View file @
88c3e114
...
@@ -300,7 +300,8 @@ class MiniCPMOMultiModalProcessor(MiniCPMVMultiModalProcessor[MiniCPMOProcessing
...
@@ -300,7 +300,8 @@ class MiniCPMOMultiModalProcessor(MiniCPMVMultiModalProcessor[MiniCPMOProcessing
if
(
audios
:
=
mm_data
.
get
(
"audios"
))
is
None
:
if
(
audios
:
=
mm_data
.
get
(
"audios"
))
is
None
:
return
{}
return
{}
parsed_audios
=
self
.
data_parser
.
parse_mm_data
({
"audio"
:
audios
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"audio"
:
audios
},
validate
=
False
)
parsed_audios
=
mm_items
.
get_items
(
"audio"
,
(
MiniCPMOAudioEmbeddingItems
,
AudioProcessorItems
)
"audio"
,
(
MiniCPMOAudioEmbeddingItems
,
AudioProcessorItems
)
)
)
...
...
vllm/model_executor/models/minicpmv.py
View file @
88c3e114
...
@@ -767,7 +767,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
...
@@ -767,7 +767,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
None
:
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
None
:
return
{}
return
{}
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
parsed_images
=
mm_items
.
get_items
(
"image"
,
(
MiniCPMVImageEmbeddingItems
,
ImageProcessorItems
)
"image"
,
(
MiniCPMVImageEmbeddingItems
,
ImageProcessorItems
)
)
)
...
@@ -793,7 +794,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
...
@@ -793,7 +794,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
if
(
videos
:
=
mm_data
.
get
(
"videos"
))
is
None
:
if
(
videos
:
=
mm_data
.
get
(
"videos"
))
is
None
:
return
{}
return
{}
parsed_videos
=
self
.
data_parser
.
parse_mm_data
({
"video"
:
videos
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"video"
:
videos
},
validate
=
False
)
parsed_videos
=
mm_items
.
get_items
(
"video"
,
(
MiniCPMVVideoEmbeddingItems
,
VideoProcessorItems
)
"video"
,
(
MiniCPMVVideoEmbeddingItems
,
VideoProcessorItems
)
)
)
...
...
vllm/model_executor/models/mllama4.py
View file @
88c3e114
...
@@ -609,9 +609,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo])
...
@@ -609,9 +609,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo])
)
)
images
=
mm_data
[
"images"
]
images
=
mm_data
[
"images"
]
parsed_images
=
self
.
data_parser
.
parse_mm_data
({
"image"
:
images
}).
get_items
(
mm_items
=
self
.
info
.
parse_mm_data
({
"image"
:
images
},
validate
=
False
)
"image"
,
ImageProcessorItems
parsed_images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
)
tile_size
=
vision_config
.
image_size
tile_size
=
vision_config
.
image_size
possible_resolutions
=
find_supported_resolutions
(
possible_resolutions
=
find_supported_resolutions
(
...
...
vllm/model_executor/models/nemotron_parse.py
View file @
88c3e114
...
@@ -660,7 +660,7 @@ class NemotronParseMultiModalProcessor(
...
@@ -660,7 +660,7 @@ class NemotronParseMultiModalProcessor(
def
create_encoder_prompt
(
def
create_encoder_prompt
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
)
->
str
|
list
[
int
]:
)
->
str
|
list
[
int
]:
return
[
0
]
return
[
0
]
...
...
vllm/model_executor/models/paligemma.py
View file @
88c3e114
...
@@ -225,14 +225,14 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
...
@@ -225,14 +225,14 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
def
apply
(
def
apply
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
)
->
MultiModalInputs
:
)
->
MultiModalInputs
:
mm_inputs
=
super
().
apply
(
mm_inputs
=
super
().
apply
(
prompt
,
prompt
,
mm_
data
,
mm_
items
,
hf_processor_mm_kwargs
,
hf_processor_mm_kwargs
,
tokenization_kwargs
,
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
mm_uuids
=
mm_uuids
,
...
...
vllm/model_executor/models/pixtral.py
View file @
88c3e114
...
@@ -303,9 +303,11 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
...
@@ -303,9 +303,11 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
res
=
tokenizer
.
mistral
.
encode_chat_completion
(
request
)
res
=
tokenizer
.
mistral
.
encode_chat_completion
(
request
)
dummy_tokens
=
res
.
tokens
dummy_tokens
=
res
.
tokens
dummy_mm_items
=
self
.
info
.
parse_mm_data
(
dummy_mm_data
)
return
ProcessorInputs
(
return
ProcessorInputs
(
prompt
=
dummy_tokens
,
prompt
=
dummy_tokens
,
mm_
data
=
dummy_mm_
data
,
mm_
items
=
dummy_mm_
items
,
tokenization_kwargs
=
tokenization_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
)
)
...
...
vllm/model_executor/models/siglip.py
View file @
88c3e114
...
@@ -187,20 +187,20 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
...
@@ -187,20 +187,20 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
def
apply
(
def
apply
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
*
,
*
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
)
->
MultiModalInputs
:
)
->
MultiModalInputs
:
if
prompt
and
mm_
data
:
if
prompt
and
mm_
items
:
raise
ValueError
(
raise
ValueError
(
"Siglip accepts text-only or image-only inputs, not both! "
"Siglip accepts text-only or image-only inputs, not both! "
"Image-only inputs means passing an image with an empty text "
"Image-only inputs means passing an image with an empty text "
"prompt."
"prompt."
)
)
if
mm_
data
:
if
mm_
items
:
# For multi-modal data, the prompt after processing should
# For multi-modal data, the prompt after processing should
# only contain the image token
# only contain the image token
tokenization_kwargs
=
{
tokenization_kwargs
=
{
...
@@ -210,7 +210,7 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
...
@@ -210,7 +210,7 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
return
super
().
apply
(
return
super
().
apply
(
prompt
=
prompt
,
prompt
=
prompt
,
mm_
data
=
mm_data
,
mm_
items
=
mm_items
,
hf_processor_mm_kwargs
=
hf_processor_mm_kwargs
,
hf_processor_mm_kwargs
=
hf_processor_mm_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
mm_uuids
=
mm_uuids
,
...
...
vllm/model_executor/models/terratorch.py
View file @
88c3e114
...
@@ -180,20 +180,20 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
...
@@ -180,20 +180,20 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
def
apply
(
def
apply
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
)
->
MultiModalInputs
:
)
->
MultiModalInputs
:
mm_items
=
self
.
_to_mm_items
(
mm_data
)
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
tokenization_kwargs
or
{}
tokenization_kwargs
=
{}
mm_hashes
=
self
.
_hash_mm_items
(
mm_hashes
=
self
.
_hash_mm_items
(
mm_items
,
hf_processor_mm_kwargs
,
tokenization_kwargs
,
mm_uuids
=
mm_uuids
mm_items
,
hf_processor_mm_kwargs
,
tokenization_kwargs
,
mm_uuids
=
mm_uuids
)
)
mm_processed_data
=
BatchFeature
(
_
,
passthrough_data
=
self
.
_get_hf_mm_data
(
mm_items
)
mm_data
.
get
(
"image"
,
mm_data
),
tensor_type
=
"pt"
mm_processed_data
=
BatchFeature
(
dict
(
passthrough_data
),
tensor_type
=
"pt"
)
)
mm_placeholders
=
{
"image"
:
[
PlaceholderRange
(
offset
=
0
,
length
=
0
)]}
mm_placeholders
=
{
"image"
:
[
PlaceholderRange
(
offset
=
0
,
length
=
0
)]}
mm_kwargs
=
MultiModalKwargsItems
.
from_hf_inputs
(
mm_kwargs
=
MultiModalKwargsItems
.
from_hf_inputs
(
...
...
vllm/model_executor/models/transformers/multimodal.py
View file @
88c3e114
...
@@ -174,7 +174,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
...
@@ -174,7 +174,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
def
apply
(
def
apply
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
tokenization_kwargs
:
Mapping
[
str
,
object
]
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
...
@@ -188,7 +188,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
...
@@ -188,7 +188,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
if
tokenization_kwargs
is
None
:
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
{}
tokenization_kwargs
=
{}
mm_items
=
self
.
_to_mm_items
(
mm_data
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
if
not
isinstance
(
prompt
,
str
):
if
not
isinstance
(
prompt
,
str
):
# the prompt is the tokenized ids which is not supported
# the prompt is the tokenized ids which is not supported
...
...
vllm/model_executor/models/voxtral.py
View file @
88c3e114
...
@@ -262,11 +262,14 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
...
@@ -262,11 +262,14 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
)
)
res
=
tokenizer
.
mistral
.
encode_chat_completion
(
request
)
res
=
tokenizer
.
mistral
.
encode_chat_completion
(
request
)
dummy_tokens
=
res
.
tokens
dummy_tokens
=
res
.
tokens
dummy_mm_inputs
=
self
.
info
.
parse_mm_data
(
# whixtral tokenizer adds padding to the audio
# whixtral tokenizer adds padding to the audio
# so we need to update the audio arrays
# so we need to update the audio arrays
dummy_mm_data
[
"audio"
]
=
[
a
.
audio_array
for
a
in
res
.
audios
]
{
**
dummy_mm_data
,
"audio"
:
[
a
.
audio_array
for
a
in
res
.
audios
]},
)
return
ProcessorInputs
(
prompt
=
dummy_tokens
,
mm_
data
=
dummy_mm_
data
)
return
ProcessorInputs
(
prompt
=
dummy_tokens
,
mm_
items
=
dummy_mm_
inputs
)
class
VoxtralMultiModalProcessor
(
BaseMultiModalProcessor
[
VoxtralProcessingInfo
]):
class
VoxtralMultiModalProcessor
(
BaseMultiModalProcessor
[
VoxtralProcessingInfo
]):
...
...
vllm/model_executor/models/whisper.py
View file @
88c3e114
...
@@ -705,7 +705,7 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
...
@@ -705,7 +705,7 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
def
create_encoder_prompt
(
def
create_encoder_prompt
(
self
,
self
,
prompt
:
str
|
list
[
int
],
prompt
:
str
|
list
[
int
],
mm_
data
:
MultiModalData
Dict
,
mm_
items
:
MultiModalData
Items
,
)
->
str
|
list
[
int
]:
)
->
str
|
list
[
int
]:
# Strictly speaking, whisper encoder only accept audio features.
# Strictly speaking, whisper encoder only accept audio features.
# We create a dummy encoder prompt here which will be padded to
# We create a dummy encoder prompt here which will be padded to
...
...
vllm/multimodal/processing/context.py
View file @
88c3e114
...
@@ -14,7 +14,13 @@ import torch
...
@@ -14,7 +14,13 @@ import torch
from
typing_extensions
import
TypeVar
from
typing_extensions
import
TypeVar
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.multimodal.parse
import
MultiModalDataParser
from
vllm.multimodal.inputs
import
MultiModalDataDict
from
vllm.multimodal.parse
import
(
DictEmbeddingItems
,
EmbeddingItems
,
MultiModalDataItems
,
MultiModalDataParser
,
)
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.processor
import
cached_processor_from_config
from
vllm.transformers_utils.processor
import
cached_processor_from_config
from
vllm.utils.func_utils
import
get_allowed_kwarg_only_overrides
from
vllm.utils.func_utils
import
get_allowed_kwarg_only_overrides
...
@@ -596,6 +602,10 @@ class BaseProcessingInfo:
...
@@ -596,6 +602,10 @@ class BaseProcessingInfo:
expected_hidden_size
=
self
.
_get_expected_hidden_size
(),
expected_hidden_size
=
self
.
_get_expected_hidden_size
(),
)
)
@
cached_property
def
data_parser
(
self
)
->
MultiModalDataParser
:
return
self
.
get_data_parser
()
@
property
@
property
def
skip_prompt_length_check
(
self
)
->
bool
:
def
skip_prompt_length_check
(
self
)
->
bool
:
return
False
return
False
...
@@ -655,6 +665,36 @@ class BaseProcessingInfo:
...
@@ -655,6 +665,36 @@ class BaseProcessingInfo:
raise
ValueError
(
msg
)
raise
ValueError
(
msg
)
def
parse_mm_data
(
self
,
mm_data
:
MultiModalDataDict
,
*
,
validate
:
bool
=
True
,
)
->
MultiModalDataItems
:
"""
Normalize
[`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
to [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]
before passing them to
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
"""
mm_items
=
self
.
data_parser
.
parse_mm_data
(
mm_data
)
if
validate
:
mm_config
=
self
.
ctx
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
for
modality
,
items
in
mm_items
.
items
():
if
isinstance
(
items
,
(
EmbeddingItems
,
DictEmbeddingItems
)):
raise
ValueError
(
f
"You must set `--enable-mm-embeds` to input "
f
"`
{
modality
}
_embeds`"
)
for
modality
,
items
in
mm_items
.
items
():
self
.
validate_num_items
(
modality
,
len
(
items
))
return
mm_items
def
get_mm_max_tokens_per_item
(
def
get_mm_max_tokens_per_item
(
self
,
self
,
seq_len
:
int
,
seq_len
:
int
,
...
...
vllm/multimodal/processing/dummy_inputs.py
View file @
88c3e114
...
@@ -18,6 +18,7 @@ from vllm.config.multimodal import (
...
@@ -18,6 +18,7 @@ from vllm.config.multimodal import (
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
..inputs
import
MultiModalDataDict
from
..inputs
import
MultiModalDataDict
from
..parse
import
MultiModalDataItems
from
.context
import
BaseProcessingInfo
from
.context
import
BaseProcessingInfo
_I
=
TypeVar
(
"_I"
,
bound
=
BaseProcessingInfo
)
_I
=
TypeVar
(
"_I"
,
bound
=
BaseProcessingInfo
)
...
@@ -33,7 +34,7 @@ class ProcessorInputs:
...
@@ -33,7 +34,7 @@ class ProcessorInputs:
"""
"""
prompt
:
str
|
list
[
int
]
prompt
:
str
|
list
[
int
]
mm_
data
:
MultiModalData
Dict
mm_
items
:
MultiModalData
Items
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
]
=
field
(
default_factory
=
dict
)
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
]
=
field
(
default_factory
=
dict
)
tokenization_kwargs
:
Mapping
[
str
,
object
]
=
field
(
default_factory
=
dict
)
tokenization_kwargs
:
Mapping
[
str
,
object
]
=
field
(
default_factory
=
dict
)
...
@@ -93,15 +94,14 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
...
@@ -93,15 +94,14 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
mm_options: Configurable options per modality (optional)
mm_options: Configurable options per modality (optional)
"""
"""
dummy_text
=
self
.
get_dummy_text
(
mm_counts
)
dummy_text
=
self
.
get_dummy_text
(
mm_counts
)
# Use the unified function for both legacy and configurable cases
dummy_mm_data
=
self
.
get_dummy_mm_data
(
seq_len
,
mm_counts
,
mm_options
)
dummy_mm_data
=
self
.
get_dummy_mm_data
(
seq_len
,
mm_counts
,
mm_options
)
dummy_mm_items
=
self
.
info
.
parse_mm_data
(
dummy_mm_data
)
tokenization_kwargs
=
{
"truncation"
:
False
}
tokenization_kwargs
=
{
"truncation"
:
False
}
return
ProcessorInputs
(
return
ProcessorInputs
(
prompt
=
dummy_text
,
prompt
=
dummy_text
,
mm_
data
=
dummy_mm_
data
,
mm_
items
=
dummy_mm_
items
,
tokenization_kwargs
=
tokenization_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
)
)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment