Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
90f9c2eb
Unverified
Commit
90f9c2eb
authored
Jun 16, 2025
by
Russell Bryant
Committed by
GitHub
Jun 16, 2025
Browse files
[V1] Change return type on get_multimodal_embeddings() (#19446)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
387bdf0a
Changes
37
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
49 additions
and
47 deletions
+49
-47
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mllama4.py
+2
-3
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+3
-3
vllm/model_executor/models/ovis.py
vllm/model_executor/models/ovis.py
+3
-3
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+3
-3
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+4
-4
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+3
-2
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+3
-3
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
+3
-3
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+3
-3
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+3
-3
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+3
-2
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+3
-3
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+3
-3
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+3
-3
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+3
-3
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+2
-2
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+2
-1
No files found.
vllm/model_executor/models/mllama4.py
View file @
90f9c2eb
...
@@ -794,11 +794,10 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -794,11 +794,10 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
self
,
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
MultiModalEmbeddings
:
**
kwargs
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
return
self
.
_process_image_input
(
image_input
)
return
self
.
_process_image_input
(
image_input
)
...
...
vllm/model_executor/models/molmo.py
View file @
90f9c2eb
...
@@ -1473,11 +1473,11 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
...
@@ -1473,11 +1473,11 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
model
return
self
.
model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
return
self
.
_process_image_input
(
image_input
)
return
self
.
_process_image_input
(
image_input
)
...
...
vllm/model_executor/models/ovis.py
View file @
90f9c2eb
...
@@ -499,11 +499,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -499,11 +499,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
return
tuple
(
vision_embeddings
)
return
tuple
(
vision_embeddings
)
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
image_features
=
self
.
_process_image_input
(
image_input
)
image_features
=
self
.
_process_image_input
(
image_input
)
...
...
vllm/model_executor/models/paligemma.py
View file @
90f9c2eb
...
@@ -338,11 +338,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -338,11 +338,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
vision_embeddings
=
vision_embeddings
*
(
self
.
config
.
hidden_size
**-
0.5
)
vision_embeddings
=
vision_embeddings
*
(
self
.
config
.
hidden_size
**-
0.5
)
...
...
vllm/model_executor/models/phi3v.py
View file @
90f9c2eb
...
@@ -655,11 +655,11 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -655,11 +655,11 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
return
vision_embeddings
...
@@ -669,7 +669,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -669,7 +669,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
:
inputs_embeds
=
merge_multimodal_embeddings
(
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
image_token_id
)
self
.
image_token_id
)
...
...
vllm/model_executor/models/phi4mm.py
View file @
90f9c2eb
...
@@ -1112,11 +1112,12 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
...
@@ -1112,11 +1112,12 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
image_attention_mask
)
image_attention_mask
)
return
image_embeds
return
image_embeds
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
modalities
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
modalities
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
if
not
modalities
:
if
not
modalities
:
return
[]
return
None
return
None
# The result multimodal_embeddings is tuple of tensors, with each
# The result multimodal_embeddings is tuple of tensors, with each
...
...
vllm/model_executor/models/pixtral.py
View file @
90f9c2eb
...
@@ -409,11 +409,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -409,11 +409,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
return
self
.
_process_image_input
(
image_input
)
return
self
.
_process_image_input
(
image_input
)
...
...
vllm/model_executor/models/qwen2_5_omni_thinker.py
View file @
90f9c2eb
...
@@ -772,13 +772,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
...
@@ -772,13 +772,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
mm_input_by_modality
=
self
.
_parse_and_validate_multimodal_inputs
(
mm_input_by_modality
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
**
kwargs
)
if
not
mm_input_by_modality
:
if
not
mm_input_by_modality
:
return
None
return
[]
# The result multimodal_embeddings is tuple of tensors, with each
# The result multimodal_embeddings is tuple of tensors, with each
# tensor corresponding to a multimodal data item (image or video).
# tensor corresponding to a multimodal data item (image or video).
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
90f9c2eb
...
@@ -1016,13 +1016,13 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1016,13 +1016,13 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
mm_input_by_modality
=
self
.
_parse_and_validate_multimodal_inputs
(
mm_input_by_modality
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
**
kwargs
)
if
not
mm_input_by_modality
:
if
not
mm_input_by_modality
:
return
None
return
[]
# The result multimodal_embeddings is tuple of tensors, with each
# The result multimodal_embeddings is tuple of tensors, with each
# tensor corresponding to a multimodal data item (image or video).
# tensor corresponding to a multimodal data item (image or video).
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
90f9c2eb
...
@@ -350,11 +350,11 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -350,11 +350,11 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
if
audio_input
is
None
:
if
audio_input
is
None
:
return
None
return
[]
masked_audio_features
=
self
.
_process_audio_input
(
audio_input
)
masked_audio_features
=
self
.
_process_audio_input
(
audio_input
)
return
masked_audio_features
return
masked_audio_features
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
90f9c2eb
...
@@ -1257,11 +1257,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1257,11 +1257,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
modalities
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
modalities
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
if
not
modalities
:
if
not
modalities
:
return
[]
return
None
return
None
# The result multimodal_embeddings is tuple of tensors, with each
# The result multimodal_embeddings is tuple of tensors, with each
...
...
vllm/model_executor/models/qwen_vl.py
View file @
90f9c2eb
...
@@ -738,11 +738,11 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
...
@@ -738,11 +738,11 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
transformer
return
self
.
transformer
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
return
vision_embeddings
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
90f9c2eb
...
@@ -869,11 +869,11 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -869,11 +869,11 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
return
self
.
_process_image_input
(
image_input
)
return
self
.
_process_image_input
(
image_input
)
...
...
vllm/model_executor/models/tarsier.py
View file @
90f9c2eb
...
@@ -585,11 +585,11 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -585,11 +585,11 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
[]
return
self
.
_process_image_input
(
image_input
)
return
self
.
_process_image_input
(
image_input
)
def
get_input_embeddings
(
def
get_input_embeddings
(
...
...
vllm/model_executor/models/ultravox.py
View file @
90f9c2eb
...
@@ -546,11 +546,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
...
@@ -546,11 +546,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
if
audio_input
is
None
:
if
audio_input
is
None
:
return
None
return
[]
audio_embeddings
=
self
.
_process_audio_input
(
audio_input
)
audio_embeddings
=
self
.
_process_audio_input
(
audio_input
)
return
audio_embeddings
return
audio_embeddings
...
...
vllm/model_executor/models/whisper.py
View file @
90f9c2eb
...
@@ -687,8 +687,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
...
@@ -687,8 +687,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
model
.
decoder
return
self
.
model
.
decoder
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]
:
**
kwargs
:
object
)
->
MultiModalEmbeddings
:
# TODO: This method does not obey the interface for SupportsMultiModal.
# TODO: This method does not obey the interface for SupportsMultiModal.
# Refactor this once encoder/decoder support is implemented in V1.
# Refactor this once encoder/decoder support is implemented in V1.
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
...
...
vllm/v1/worker/utils.py
View file @
90f9c2eb
...
@@ -4,11 +4,12 @@ from typing import Optional
...
@@ -4,11 +4,12 @@ from typing import Optional
import
torch
import
torch
from
vllm.model_executor.models.interfaces
import
MultiModalEmbeddings
from
vllm.v1.kv_cache_interface
import
KVCacheGroupSpec
from
vllm.v1.kv_cache_interface
import
KVCacheGroupSpec
def
sanity_check_mm_encoder_outputs
(
def
sanity_check_mm_encoder_outputs
(
mm_embeddings
:
object
,
mm_embeddings
:
MultiModalEmbeddings
,
expected_num_items
:
int
,
expected_num_items
:
int
,
)
->
None
:
)
->
None
:
"""
"""
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment