Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b4784001
Unverified
Commit
b4784001
authored
Apr 21, 2026
by
Shanshan Shen
Committed by
GitHub
Apr 21, 2026
Browse files
[MM][Misc] Support image+video mixed inputs (per prompt) for VLM examples (#40335)
Signed-off-by:
shen-shanshan
<
467638484@qq.com
>
parent
989cc12d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
291 additions
and
101 deletions
+291
-101
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+291
-101
No files found.
examples/offline_inference/vision_language.py
View file @
b4784001
...
...
@@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData:
def
run_ernie45_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"baidu/ERNIE-4.5-VL-28B-A3B-PT"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
trust_remote_code
=
True
,
)
image_placeholder
=
"Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
video_placeholder
=
"Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
if
modality
==
"image"
:
placeholder
=
"Picture 1:<|IMAGE_START|><|
image
@
placeholder
|><|IMAGE_END|>"
placeholder
=
image
_
placeholder
elif
modality
==
"video"
:
placeholder
=
"Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
...
...
@@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
def
run_exaone4_5
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"LGAI-EXAONE/EXAONE-4.5-33B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<vision><|image_pad|></vision>"
video_placeholder
=
"<vision><|video_pad|></vision>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
"<|system|>
\n
You are a helpful assistant.<|endofturn|>
\n
"
f
"<|user|>
\n
<vision>
{
placeholder
}
</vision>
"
f
"<|user|>
\n
{
placeholder
}
"
f
"
{
question
}
<|endofturn|>
\n
"
"<|assistant|>
\n
"
)
...
...
@@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
def
run_glm4_1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"zai-org/GLM-4.1V-9B-Thinking"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
"size"
:
{
"shortest_edge"
:
12544
,
"longest_edge"
:
47040000
},
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
enforce_eager
=
True
,
)
image_placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
if
modality
==
"image"
:
placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
...
...
@@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
def
run_glm4_5v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"zai-org/GLM-4.5V"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
"size"
:
{
"shortest_edge"
:
12544
,
"longest_edge"
:
47040000
},
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
enforce_eager
=
True
,
tensor_parallel_size
=
4
,
)
image_placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
if
modality
==
"image"
:
placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
...
...
@@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
def
run_glm4_5v_fp8
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"zai-org/GLM-4.5V-FP8"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
"size"
:
{
"shortest_edge"
:
12544
,
"longest_edge"
:
47040000
},
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
enforce_eager
=
True
,
tensor_parallel_size
=
4
,
)
image_placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
if
modality
==
"image"
:
placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
...
...
@@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
def
run_glm_ocr
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"zai-org/GLM-OCR"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
"size"
:
{
"shortest_edge"
:
12544
,
"longest_edge"
:
47040000
},
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
enforce_eager
=
True
,
)
image_placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
if
modality
==
"image"
:
placeholder
=
"<|begin_of_image|><|image|><|end_of_image|>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|begin_of_video|><|video|><|end_of_video|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
...
...
@@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision(
model_name
=
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
if
modality
==
"image"
else
16384
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
max_model_len
=
16384
if
modality
in
(
"video"
,
"image+video"
)
else
8192
,
limit_mm_per_prompt
=
mm_limit
,
)
messages
=
list
()
...
...
@@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision(
}
]
)
elif
modality
==
"image+video"
:
messages
.
append
(
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image"
,
"ocr"
:
""
,
"lens_keywords"
:
""
,
"lens_local_keywords"
:
""
,
},
{
"type"
:
"video"
,
},
{
"type"
:
"text"
,
"text"
:
question
,
},
],
}
]
)
else
:
raise
ValueError
(
f
"Unsupported modality:
{
modality
}
"
)
...
...
@@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
def
run_interns1
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"internlm/Intern-S1-mini"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
enforce_eager
=
True
,
)
image_placeholder
=
"<IMG_CONTEXT>"
video_placeholder
=
"<video>"
if
modality
==
"image"
:
placeholder
=
"<IMG_CONTEXT>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<video>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
"
\n
"
+
video_placeholder
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
...
...
@@ -909,20 +975,26 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
def
run_interns1_pro
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"internlm/Intern-S1-Pro"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
enforce_eager
=
True
,
tensor_parallel_size
=
4
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
...
...
@@ -943,17 +1015,23 @@ def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
def
run_internvl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"OpenGVLab/InternVL3-2B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<image>"
video_placeholder
=
"<video>"
if
modality
==
"image"
:
placeholder
=
"<
image
>"
placeholder
=
image
_placeholder
elif
modality
==
"video"
:
placeholder
=
"<video>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
"
\n
"
+
video_placeholder
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
...
...
@@ -1010,21 +1088,27 @@ def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData:
def
run_keye_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Kwai-Keye/Keye-VL-8B-Preview"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -1041,21 +1125,27 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
def
run_keye_vl1_5
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Kwai-Keye/Keye-VL-1.5-8B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -1259,22 +1349,26 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat
# LLaVA-OneVision
def
run_llava_onevision
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
if
modality
==
"video"
:
prompts
=
[
f
"<|im_start|>user <video>
\n
{
question
}
<|im_end|><|im_start|>assistant
\n
"
for
question
in
questions
]
image_placeholder
=
"<image>"
video_placeholder
=
"<video>"
elif
modality
==
"image"
:
prompts
=
[
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|><|im_start|>assistant
\n
"
for
question
in
questions
]
if
modality
==
"image"
:
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
"
\n
"
+
video_placeholder
prompts
=
[
(
f
"<|im_start|>user
{
placeholder
}
\n
{
question
}
<|im_end|><|im_start|>assistant
\n
"
)
for
question
in
questions
]
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
,
max_model_len
=
16384
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
return
ModelRequestData
(
...
...
@@ -1307,7 +1401,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
# MiniCPM-V
def
run_minicpmv_base
(
questions
:
list
[
str
],
modality
:
str
,
model_name
):
assert
modality
in
[
"image"
,
"video"
]
assert
modality
in
[
"image"
,
"video"
,
"image+video"
]
# If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
# 2.0
...
...
@@ -1329,12 +1423,13 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
# o2.6: image, video, audio
# model_name = "openbmb/MiniCPM-o-2_6"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
...
...
@@ -1347,17 +1442,22 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
stop_tokens
=
[
"<|im_end|>"
,
"<|endoftext|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
modality_placeholder
=
{
"image"
:
"(<image>./</image>)"
,
"video"
:
"(<video>./</video>)"
,
}
image_placeholder
=
"(<image>./</image>)"
video_placeholder
=
"(<video>./</video>)"
if
modality
==
"image"
:
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
"
\n
"
+
video_placeholder
prompts
=
[
tokenizer
.
apply_chat_template
(
[
{
"role"
:
"user"
,
"content"
:
f
"
{
modality_
placeholder
[
modality
]
}
\n
{
question
}
"
,
"content"
:
f
"
{
placeholder
}
\n
{
question
}
"
,
}
],
tokenize
=
False
,
...
...
@@ -1466,20 +1566,24 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
def
run_molmo2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"allenai/Molmo2-8B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
max_num_batched_tokens
=
36864
,
)
image_placeholder
=
"<|image|>"
video_placeholder
=
"<|video|>"
if
modality
==
"image"
:
placeholder
=
"<|
image
|>"
placeholder
=
image
_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|
video
|>"
el
se
:
raise
ValueError
(
f
"Unsupported modality for molmo2:
{
modality
}
"
)
placeholder
=
video
_placeholder
el
if
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
f
"
{
placeholder
}
<|im_start|>user
\n
{
question
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
...
...
@@ -1563,19 +1667,25 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
def
run_openpangu_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"FreedomIntelligence/openPangu-VL-7B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
4
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"[unused19]"
video_placeholder
=
"[unused32]"
if
modality
==
"image"
:
placeholder
=
"[unused19]"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"[unused32]"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
...
...
@@ -1623,18 +1733,25 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
def
run_ovis2_5
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"AIDC-AI/Ovis2.5-2B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
dtype
=
"half"
,
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<image>"
video_placeholder
=
"<video>"
if
modality
==
"image"
:
placeholder
=
"<
image
>"
placeholder
=
image
_placeholder
elif
modality
==
"video"
:
placeholder
=
"<video>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
"
\n
"
+
video_placeholder
prompts
=
[
f
"<|im_start|>user
\n\n
{
placeholder
}
\n
{
question
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
...
...
@@ -1846,6 +1963,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
def
run_qwen2_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen2-VL-7B-Instruct"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -1855,18 +1973,23 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels"
:
28
*
28
,
"max_pixels"
:
1280
*
28
*
28
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -1883,6 +2006,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
def
run_qwen2_5_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen2.5-VL-3B-Instruct"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -1892,18 +2016,23 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -1920,6 +2049,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
def
run_qwen2_5_omni
(
questions
:
list
[
str
],
modality
:
str
):
model_name
=
"Qwen/Qwen2.5-Omni-7B"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -1929,13 +2059,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_bos|><|IMAGE|><|vision_eos|>"
video_placeholder
=
"<|vision_bos|><|VIDEO|><|vision_eos|>"
if
modality
==
"image"
:
placeholder
=
"<|IMAGE|>"
placeholder
=
image_placeholder
elif
modality
==
"video"
:
placeholder
=
"<|VIDEO|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
default_system
=
(
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
...
...
@@ -1946,7 +2081,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
prompts
=
[
(
f
"<|im_start|>system
\n
{
default_system
}
<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_bos|>
{
placeholder
}
<|vision_eos|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -1962,6 +2097,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
def
run_qwen3_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen3-VL-4B-Instruct"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -1971,18 +2107,23 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -1999,6 +2140,7 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
def
run_qwen3_vl_moe
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen3-VL-30B-A3B-Instruct"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -2008,18 +2150,23 @@ def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -2190,6 +2337,7 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
def
run_tarsier2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"omni-research/Tarsier2-Recap-7b"
mm_limit
=
{
"image"
:
1
,
"video"
:
1
}
if
modality
==
"image+video"
else
{
modality
:
1
}
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
...
...
@@ -2197,18 +2345,23 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
],
"model_type"
:
"tarsier2"
,
},
limit_mm_per_prompt
=
{
modality
:
1
}
,
limit_mm_per_prompt
=
mm_limit
,
)
image_placeholder
=
"<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder
=
"<|vision_start|><|video_pad|><|vision_end|>"
if
modality
==
"image"
:
placeholder
=
"<|
image_p
ad|>"
placeholder
=
image_p
laceholder
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
video_placeholder
elif
modality
==
"image+video"
:
placeholder
=
image_placeholder
+
video_placeholder
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>
"
f
"<|im_start|>user
\n
{
placeholder
}
"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
...
...
@@ -2357,6 +2510,24 @@ def get_multi_modal_input(args):
"questions"
:
vision_chunk_questions
,
}
if
args
.
modality
==
"image+video"
:
image
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
)
needs_metadata
=
args
.
model_type
in
MODELS_NEED_VIDEO_METADATA
video
=
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
args
.
num_frames
).
np_ndarrays
metadata
=
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
args
.
num_frames
).
metadata
img_video_questions
=
[
"What is shown in the image? What happens in the video?"
,
"Describe both the image and the video content."
,
]
return
{
"data"
:
{
"image"
:
image
,
"video"
:
([(
video
,
metadata
)]
if
needs_metadata
else
video
),
},
"questions"
:
img_video_questions
,
}
msg
=
f
"Modality
{
args
.
modality
}
is not supported."
raise
ValueError
(
msg
)
...
...
@@ -2439,7 +2610,7 @@ def parse_args():
"--modality"
,
type
=
str
,
default
=
"image"
,
choices
=
[
"image"
,
"video"
,
"vision_chunk"
],
choices
=
[
"image"
,
"video"
,
"image+video"
,
"vision_chunk"
],
help
=
"Modality of the input."
,
)
parser
.
add_argument
(
...
...
@@ -2546,23 +2717,42 @@ def main(args):
else
req_data
.
sampling_params
)
def
_mm_data
(
data
,
modality
):
if
modality
==
"image+video"
:
return
{
"image"
:
data
[
"image"
],
"video"
:
data
[
"video"
]}
return
{
modality
:
data
}
def
_mm_uuid
(
uuid
,
modality
):
if
modality
==
"image+video"
:
return
{
"image"
:
uuid
,
"video"
:
uuid
+
"v"
}
return
{
modality
:
uuid
}
def
_mm_empty
(
modality
):
if
modality
==
"image+video"
:
return
{
"image"
:
None
,
"video"
:
None
}
return
{
modality
:
None
}
assert
args
.
num_prompts
>
0
if
args
.
num_prompts
==
1
:
# Single inference
uuid
=
"uuid_0"
inputs
=
{
"prompt"
:
prompts
[
0
],
"multi_modal_data"
:
{
modality
:
data
}
,
"multi_modal_uuids"
:
{
modality
:
uuid
}
,
"multi_modal_data"
:
_mm_data
(
data
,
modality
)
,
"multi_modal_uuids"
:
_mm_uuid
(
uuid
,
modality
)
,
}
inputs_with_empty_media
=
{
"prompt"
:
prompts
[
0
],
"multi_modal_data"
:
{
modality
:
None
}
,
"multi_modal_uuids"
:
{
modality
:
uuid
}
,
"multi_modal_data"
:
_mm_empty
(
modality
)
,
"multi_modal_uuids"
:
_mm_uuid
(
uuid
,
modality
)
,
}
else
:
# Batch inference
if
args
.
image_repeat_prob
is
not
None
:
if
modality
==
"image+video"
:
raise
ValueError
(
"--image-repeat-prob is not supported for 'image+video' modality"
)
# Repeat images with specified probability of "image_repeat_prob"
inputs
,
inputs_with_empty_media
=
apply_image_repeat
(
args
.
image_repeat_prob
,
...
...
@@ -2572,7 +2762,7 @@ def main(args):
modality
,
)
else
:
# Use the same image for all prompts
# Use the same image
/video
for all prompts
inputs
=
[]
inputs_with_empty_media
=
[]
for
i
in
range
(
args
.
num_prompts
):
...
...
@@ -2580,15 +2770,15 @@ def main(args):
inputs
.
append
(
{
"prompt"
:
prompts
[
i
%
len
(
prompts
)],
"multi_modal_data"
:
{
modality
:
data
}
,
"multi_modal_uuids"
:
{
modality
:
uuid
}
,
"multi_modal_data"
:
_mm_data
(
data
,
modality
)
,
"multi_modal_uuids"
:
_mm_uuid
(
uuid
,
modality
)
,
}
)
inputs_with_empty_media
.
append
(
{
"prompt"
:
prompts
[
i
%
len
(
prompts
)],
"multi_modal_data"
:
{
modality
:
None
}
,
"multi_modal_uuids"
:
{
modality
:
uuid
}
,
"multi_modal_data"
:
_mm_empty
(
modality
)
,
"multi_modal_uuids"
:
_mm_uuid
(
uuid
,
modality
)
,
}
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment