Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
14fdd21d
Unverified
Commit
14fdd21d
authored
Jun 18, 2025
by
Russell Bryant
Committed by
GitHub
Jun 18, 2025
Browse files
[Core] More fixes to MultiModalEmbeddings type handling (#19715)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
04fefe7c
Changes
35
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
40 additions
and
20 deletions
+40
-20
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+2
-1
vllm/model_executor/models/aya_vision.py
vllm/model_executor/models/aya_vision.py
+2
-1
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+2
-1
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+2
-1
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+2
-1
vllm/model_executor/models/florence2.py
vllm/model_executor/models/florence2.py
+2
-1
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+2
-1
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+2
-1
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+2
-1
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granite_speech.py
+2
-1
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/idefics3.py
+2
-1
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+2
-1
vllm/model_executor/models/kimi_vl.py
vllm/model_executor/models/kimi_vl.py
+2
-1
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+2
-1
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+2
-1
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+2
-1
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+2
-1
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+2
-1
vllm/model_executor/models/minimax_vl_01.py
vllm/model_executor/models/minimax_vl_01.py
+2
-1
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral3.py
+2
-1
No files found.
vllm/model_executor/models/aria.py
View file @
14fdd21d
...
...
@@ -620,7 +620,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
config
.
image_token_index
)
...
...
vllm/model_executor/models/aya_vision.py
View file @
14fdd21d
...
...
@@ -430,7 +430,8 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
=
input_ids
,
inputs_embeds
=
inputs_embeds
,
...
...
vllm/model_executor/models/blip2.py
View file @
14fdd21d
...
...
@@ -641,7 +641,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
_IMAGE_TOKEN_ID
)
...
...
vllm/model_executor/models/chameleon.py
View file @
14fdd21d
...
...
@@ -1005,7 +1005,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
model
.
vocabulary_mapping
.
image_token_id
)
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
14fdd21d
...
...
@@ -600,7 +600,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
image_token_id
)
...
...
vllm/model_executor/models/florence2.py
View file @
14fdd21d
...
...
@@ -1046,7 +1046,8 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
pad_token_id
)
...
...
vllm/model_executor/models/fuyu.py
View file @
14fdd21d
...
...
@@ -345,7 +345,8 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
14fdd21d
...
...
@@ -592,7 +592,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
...
...
vllm/model_executor/models/glm4v.py
View file @
14fdd21d
...
...
@@ -609,7 +609,8 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
transformer
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
=
input_ids
,
inputs_embeds
=
inputs_embeds
,
...
...
vllm/model_executor/models/granite_speech.py
View file @
14fdd21d
...
...
@@ -721,7 +721,8 @@ class GraniteSpeechForConditionalGeneration(
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
"""Compute the merged LLM / audio embeddings."""
if
multimodal_embeddings
is
None
:
if
multimodal_embeddings
is
None
\
or
len
(
multimodal_embeddings
)
==
0
:
return
self
.
language_model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
embed_multimodal
(
...
...
vllm/model_executor/models/idefics3.py
View file @
14fdd21d
...
...
@@ -720,7 +720,8 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
...
...
vllm/model_executor/models/internvl.py
View file @
14fdd21d
...
...
@@ -1336,7 +1336,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
context_token_ids
=
[
token_id
for
token_id
in
(
self
.
img_context_token_id
,
self
.
video_context_token_id
)
...
...
vllm/model_executor/models/kimi_vl.py
View file @
14fdd21d
...
...
@@ -393,7 +393,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal):
# model as one of the requirements of basic vLLM model implementation.
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
=
input_ids
,
inputs_embeds
=
inputs_embeds
,
...
...
vllm/model_executor/models/llava.py
View file @
14fdd21d
...
...
@@ -683,7 +683,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
...
...
vllm/model_executor/models/llava_next.py
View file @
14fdd21d
...
...
@@ -502,7 +502,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
if
not
multimodal_embeddings
:
if
multimodal_embeddings
is
None
\
or
len
(
multimodal_embeddings
)
==
0
:
return
self
.
language_model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
embed_multimodal
(
...
...
vllm/model_executor/models/llava_next_video.py
View file @
14fdd21d
...
...
@@ -426,7 +426,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
config
.
video_token_index
)
...
...
vllm/model_executor/models/llava_onevision.py
View file @
14fdd21d
...
...
@@ -881,7 +881,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
[
self
.
config
.
image_token_index
,
self
.
config
.
video_token_index
])
...
...
vllm/model_executor/models/minicpmv.py
View file @
14fdd21d
...
...
@@ -892,7 +892,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
llm
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
assert
len
(
self
.
mm_token_ids
)
>
0
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
...
...
vllm/model_executor/models/minimax_vl_01.py
View file @
14fdd21d
...
...
@@ -201,7 +201,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
...
...
vllm/model_executor/models/mistral3.py
View file @
14fdd21d
...
...
@@ -521,7 +521,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
\
and
len
(
multimodal_embeddings
)
!=
0
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment