Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b6fbc156
Unverified
Commit
b6fbc156
authored
Sep 09, 2025
by
CSWYF3634076
Committed by
GitHub
Sep 09, 2025
Browse files
[BugFix][Model] Fix Ernie4.5-VL hanging on long inputs (#24074)
Signed-off-by:
wangyafeng
<
wangyafeng@baidu.com
>
parent
3e0d4a34
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
7 deletions
+18
-7
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+10
-4
vllm/model_executor/models/ernie45_vl_moe.py
vllm/model_executor/models/ernie45_vl_moe.py
+8
-3
No files found.
vllm/model_executor/models/ernie45_vl.py
View file @
b6fbc156
...
...
@@ -66,8 +66,6 @@ from .vision import get_vit_attn_backend
logger
=
init_logger
(
__name__
)
_MAX_FRAMES_PER_VIDEO
=
16
# === Vision Transformer === #
...
...
@@ -839,6 +837,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
,
"video"
:
None
}
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
int
]:
max_image_tokens
=
self
.
get_max_image_tokens
()
max_video_tokens
=
self
.
get_max_video_tokens
(
seq_len
,
mm_counts
)
return
{
"image"
:
max_image_tokens
,
"video"
:
max_video_tokens
}
def
_get_vision_info
(
self
,
*
,
...
...
@@ -964,8 +971,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
max_image_tokens
=
self
.
get_max_image_tokens
()
*
max_images
max_total_frames
=
self
.
_get_max_video_frames
(
seq_len
-
max_image_tokens
)
max_frames_per_video
=
min
(
max_total_frames
//
max
(
max_videos
,
1
),
_MAX_FRAMES_PER_VIDEO
)
max_frames_per_video
=
max_total_frames
//
max
(
max_videos
,
1
)
return
max
(
max_frames_per_video
,
2
)
...
...
vllm/model_executor/models/ernie45_vl_moe.py
View file @
b6fbc156
...
...
@@ -287,8 +287,13 @@ class Ernie4_5_VLMoeMoE(nn.Module):
if
self
.
has_shared_experts
:
shared_output
=
self
.
shared_experts
(
hidden_states
)
if
visual_token_mask
is
not
None
and
visual_token_mask
.
any
():
# assert visual_token_mask.shape[0] != hidden_states.shape[0]
if
visual_token_mask
is
not
None
and
visual_token_mask
.
all
():
# only vision modal input
router_logits
,
_
=
self
.
vision_experts_gate
(
hidden_states
)
final_hidden_states
=
self
.
vision_experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
elif
visual_token_mask
is
not
None
and
visual_token_mask
.
any
():
# text and vision modals input
visual_token_mask
=
visual_token_mask
.
repeat
(
1
,
self
.
hidden_size
).
bool
()
text_token_mask
=
~
visual_token_mask
...
...
@@ -310,7 +315,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
hidden_states
=
vision_hidden_states
,
router_logits
=
vision_router_logits
).
flatten
()
else
:
# text modal input
processing directly
#
only
text modal input
text_router_logits
,
_
=
self
.
text_experts_gate
(
hidden_states
)
final_hidden_states
=
self
.
text_experts
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment