Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d55244df
Unverified
Commit
d55244df
authored
Apr 09, 2025
by
Nicolò Lucchesi
Committed by
GitHub
Apr 09, 2025
Browse files
[Model] Add `SupportsMultiModal.get_language_model` interface (#16007)
Signed-off-by:
NickLucche
<
nlucches@redhat.com
>
parent
04149cce
Changes
33
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
39 additions
and
0 deletions
+39
-0
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mllama4.py
+3
-0
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+3
-0
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+3
-0
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+3
-0
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+3
-0
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+3
-0
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+3
-0
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+3
-0
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+3
-0
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+3
-0
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+3
-0
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+3
-0
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+3
-0
No files found.
vllm/model_executor/models/mllama4.py
View file @
d55244df
...
@@ -742,6 +742,9 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -742,6 +742,9 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
for
img
in
vision_embeddings_flat
.
split
(
patches_per_image
,
dim
=
0
)
for
img
in
vision_embeddings_flat
.
split
(
patches_per_image
,
dim
=
0
)
]
]
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
self
,
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
MultiModalEmbeddings
]:
**
kwargs
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/molmo.py
View file @
d55244df
...
@@ -1488,6 +1488,9 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
...
@@ -1488,6 +1488,9 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
)
)
]
]
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/paligemma.py
View file @
d55244df
...
@@ -323,6 +323,9 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -323,6 +323,9 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
return
self
.
multi_modal_projector
(
image_features
)
return
self
.
multi_modal_projector
(
image_features
)
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/phi3v.py
View file @
d55244df
...
@@ -674,6 +674,9 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -674,6 +674,9 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
return
image_embeds
return
image_embeds
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/phi4mm.py
View file @
d55244df
...
@@ -1802,3 +1802,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal,
...
@@ -1802,3 +1802,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal,
connector
=
[
"audio_projection_for_vision"
,
"audio_projection"
],
connector
=
[
"audio_projection_for_vision"
,
"audio_projection"
],
tower_model
=
[
"vision_encoder"
,
"embed_tokens_extend"
],
tower_model
=
[
"vision_encoder"
,
"embed_tokens_extend"
],
)
)
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
model
vllm/model_executor/models/pixtral.py
View file @
d55244df
...
@@ -396,6 +396,9 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -396,6 +396,9 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
image_embeds
=
torch
.
split
(
image_embeds
,
feature_sizes
)
image_embeds
=
torch
.
split
(
image_embeds
,
feature_sizes
)
return
image_embeds
return
image_embeds
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
d55244df
...
@@ -967,6 +967,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -967,6 +967,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
**
kwargs
)
**
kwargs
)
return
modalities
return
modalities
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
d55244df
...
@@ -355,6 +355,9 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -355,6 +355,9 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
return
torch
.
split
(
masked_audio_features
,
return
torch
.
split
(
masked_audio_features
,
audio_output_lengths
.
flatten
().
tolist
())
audio_output_lengths
.
flatten
().
tolist
())
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
d55244df
...
@@ -1276,6 +1276,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1276,6 +1276,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
return
modalities
return
modalities
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
...
...
vllm/model_executor/models/qwen_vl.py
View file @
d55244df
...
@@ -740,6 +740,9 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
...
@@ -740,6 +740,9 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
return
self
.
transformer
.
visual
(
image_input
[
"data"
])
return
self
.
transformer
.
visual
(
image_input
[
"data"
])
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
transformer
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
d55244df
...
@@ -889,6 +889,9 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -889,6 +889,9 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
else
:
else
:
self
.
visual_token_mask
=
None
self
.
visual_token_mask
=
None
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
...
...
vllm/model_executor/models/ultravox.py
View file @
d55244df
...
@@ -563,6 +563,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
...
@@ -563,6 +563,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
]
]
return
flattened_embeddings
.
split
(
embed_lens
)
return
flattened_embeddings
.
split
(
embed_lens
)
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
...
...
vllm/model_executor/models/whisper.py
View file @
d55244df
...
@@ -692,6 +692,9 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
...
@@ -692,6 +692,9 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
)
)
return
decoder_outputs
return
decoder_outputs
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
model
.
decoder
def
get_multimodal_embeddings
(
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
# TODO: This method does not obey the interface for SupportsMultiModal.
# TODO: This method does not obey the interface for SupportsMultiModal.
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment