Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2f816d6
Unverified
Commit
d2f816d6
authored
Oct 14, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 14, 2025
Browse files
[Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
577d4982
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
57 additions
and
57 deletions
+57
-57
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+3
-3
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+3
-3
vllm/model_executor/models/hyperclovax_vision.py
vllm/model_executor/models/hyperclovax_vision.py
+3
-3
vllm/model_executor/models/interns1.py
vllm/model_executor/models/interns1.py
+3
-3
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+3
-3
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+3
-3
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+2
-2
vllm/model_executor/models/minicpmo.py
vllm/model_executor/models/minicpmo.py
+2
-2
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+4
-4
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nano_nemotron_vl.py
+3
-3
vllm/model_executor/models/nemotron_vl.py
vllm/model_executor/models/nemotron_vl.py
+2
-2
vllm/model_executor/models/ovis2_5.py
vllm/model_executor/models/ovis2_5.py
+3
-3
vllm/model_executor/models/phi4_multimodal.py
vllm/model_executor/models/phi4_multimodal.py
+2
-2
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+2
-2
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
+4
-4
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+5
-5
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+3
-3
vllm/model_executor/models/qwen3_omni_moe_thinker.py
vllm/model_executor/models/qwen3_omni_moe_thinker.py
+4
-4
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl.py
+3
-3
No files found.
vllm/model_executor/models/ernie45_vl.py
View file @
d2f816d6
...
...
@@ -1645,12 +1645,12 @@ class Ernie4_5_VLMoeForConditionalGeneration(
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_video_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/glm4_1v.py
View file @
d2f816d6
...
...
@@ -1608,11 +1608,11 @@ class Glm4vForConditionalGeneration(
for
modality
in
mm_input_by_modality
:
multimodal_input
=
mm_input_by_modality
[
modality
]
if
modality
==
"image"
:
vision
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"video"
:
video_embeddings
=
self
.
_process_video_input
(
multimodal_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
def
forward
(
...
...
vllm/model_executor/models/hyperclovax_vision.py
View file @
d2f816d6
...
...
@@ -749,12 +749,12 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_video_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/interns1.py
View file @
d2f816d6
...
...
@@ -753,12 +753,12 @@ class InternS1ForConditionalGeneration(
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_vision_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_vision_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_vision_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/internvl.py
View file @
d2f816d6
...
...
@@ -1358,12 +1358,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_vision_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_vision_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_vision_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/keye.py
View file @
d2f816d6
...
...
@@ -1459,12 +1459,12 @@ class BaseKeyeModule(nn.Module):
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_video_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
def
forward
(
...
...
vllm/model_executor/models/llava_onevision.py
View file @
d2f816d6
...
...
@@ -881,8 +881,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
for
modality
in
mm_input_by_modality
:
multimodal_input
=
mm_input_by_modality
[
modality
]
if
modality
==
"image"
:
vision
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
vision
_embeddings
)
image
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"video"
:
video_embeddings
=
self
.
_process_video_pixels
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
video_embeddings
)
...
...
vllm/model_executor/models/minicpmo.py
View file @
d2f816d6
...
...
@@ -762,7 +762,7 @@ class MiniCPMO(MiniCPMV2_6):
for
modality
in
modalities
:
if
modality
==
"audios"
:
audio_input
=
modalities
[
"audios"
]
audio_
feature
s
=
self
.
_process_audio_input
(
audio_input
)
multimodal_embeddings
+=
tuple
(
audio_
feature
s
)
audio_
embedding
s
=
self
.
_process_audio_input
(
audio_input
)
multimodal_embeddings
+=
tuple
(
audio_
embedding
s
)
return
multimodal_embeddings
vllm/model_executor/models/minicpmv.py
View file @
d2f816d6
...
...
@@ -1129,12 +1129,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
image_
feature
s
=
self
.
_process_vision_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image_
feature
s
)
image_
embedding
s
=
self
.
_process_vision_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image_
embedding
s
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_
feature
s
=
self
.
_process_vision_input
(
video_input
)
multimodal_embeddings
+=
tuple
(
video_
feature
s
)
video_
embedding
s
=
self
.
_process_vision_input
(
video_input
)
multimodal_embeddings
+=
tuple
(
video_
embedding
s
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/nano_nemotron_vl.py
View file @
d2f816d6
...
...
@@ -1263,12 +1263,12 @@ class NemotronH_Nano_VL_V2(
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_video_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/nemotron_vl.py
View file @
d2f816d6
...
...
@@ -575,8 +575,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/ovis2_5.py
View file @
d2f816d6
...
...
@@ -616,12 +616,12 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_visual_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_visual_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_visual_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/phi4_multimodal.py
View file @
d2f816d6
...
...
@@ -1430,8 +1430,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
if
modality
==
"images"
:
audio_projection_mode
=
"vision"
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
vision
_embeddings
)
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"audios"
:
audio_input
=
modalities
[
"audios"
]
audio_embeddings
=
self
.
_process_audio_input
(
...
...
vllm/model_executor/models/phi4mm.py
View file @
d2f816d6
...
...
@@ -1248,8 +1248,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
if
modality
==
"images"
:
audio_projection_mode
=
"vision"
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
vision
_embeddings
)
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"audios"
:
audio_input
=
modalities
[
"audios"
]
audio_embeddings
=
self
.
_process_audio_input
(
...
...
vllm/model_executor/models/qwen2_5_omni_thinker.py
View file @
d2f816d6
...
...
@@ -1210,14 +1210,14 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
for
modality
in
mm_input_by_modality
:
multimodal_input
=
mm_input_by_modality
[
modality
]
if
modality
==
"image"
:
vision
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"video"
:
video_embeddings
=
self
.
_process_video_input
(
multimodal_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
if
modality
==
"audio"
:
audio_embeddings
=
self
.
_process_audio_input
(
multimodal_input
)
multimodal_embeddings
+=
audio_embeddings
multimodal_embeddings
+=
tuple
(
audio_embeddings
)
return
multimodal_embeddings
# TODO (ywang96): support overlapping modality embeddings so that
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
d2f816d6
...
...
@@ -1586,19 +1586,19 @@ class Qwen2_5_VLForConditionalGeneration(
for
modality
in
mm_input_by_modality
:
multimodal_input
=
mm_input_by_modality
[
modality
]
if
modality
==
"image"
:
vision
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
image
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
if
self
.
is_multimodal_pruning_enabled
:
vision
_embeddings
=
self
.
_postprocess_image_embeds_evs
(
vision
_embeddings
,
multimodal_input
image
_embeddings
=
self
.
_postprocess_image_embeds_evs
(
image
_embeddings
,
multimodal_input
)
multimodal_embeddings
+=
vision
_embeddings
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"video"
:
video_embeddings
=
self
.
_process_video_input
(
multimodal_input
)
if
self
.
is_multimodal_pruning_enabled
:
video_embeddings
=
self
.
_postprocess_video_embeds_evs
(
video_embeddings
,
multimodal_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
def
forward
(
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
d2f816d6
...
...
@@ -1561,12 +1561,12 @@ class Qwen2VLForConditionalGeneration(
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
image_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_video_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
...
...
vllm/model_executor/models/qwen3_omni_moe_thinker.py
View file @
d2f816d6
...
...
@@ -1260,14 +1260,14 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
for
modality
in
mm_input_by_modality
:
multimodal_input
=
mm_input_by_modality
[
modality
]
if
modality
==
"image"
:
vision
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"video"
:
video_embeddings
=
self
.
_process_video_input
(
multimodal_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
if
modality
==
"audio"
:
audio_embeddings
=
self
.
_process_audio_input
(
multimodal_input
)
multimodal_embeddings
+=
audio_embeddings
multimodal_embeddings
+=
tuple
(
audio_embeddings
)
return
multimodal_embeddings
def
get_input_embeddings
(
...
...
vllm/model_executor/models/qwen3_vl.py
View file @
d2f816d6
...
...
@@ -1601,11 +1601,11 @@ class Qwen3VLForConditionalGeneration(
for
modality
in
mm_input_by_modality
:
multimodal_input
=
mm_input_by_modality
[
modality
]
if
modality
==
"image"
:
vision
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
vision
_embeddings
image
_embeddings
=
self
.
_process_image_input
(
multimodal_input
)
multimodal_embeddings
+=
tuple
(
image
_embeddings
)
if
modality
==
"video"
:
video_embeddings
=
self
.
_process_video_input
(
multimodal_input
)
multimodal_embeddings
+=
video_embeddings
multimodal_embeddings
+=
tuple
(
video_embeddings
)
return
multimodal_embeddings
def
_compute_deepstack_embeds
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment