Unverified commit b4784001, authored by Shanshan Shen and committed by GitHub
Browse files

[MM][Misc] Support image+video mixed inputs (per prompt) for VLM examples (#40335)


Signed-off-by: shen-shanshan <467638484@qq.com>
parent 989cc12d
...@@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData: ...@@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData:
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT" model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
trust_remote_code=True, trust_remote_code=True,
) )
image_placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
video_placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
if modality == "image": if modality == "image":
placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
...@@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData: def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "LGAI-EXAONE/EXAONE-4.5-33B" model_name = "LGAI-EXAONE/EXAONE-4.5-33B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData: ...@@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<vision><|image_pad|></vision>"
video_placeholder = "<vision><|video_pad|></vision>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
"<|system|>\nYou are a helpful assistant.<|endofturn|>\n" "<|system|>\nYou are a helpful assistant.<|endofturn|>\n"
f"<|user|>\n<vision>{placeholder}</vision>" f"<|user|>\n{placeholder}"
f"{question}<|endofturn|>\n" f"{question}<|endofturn|>\n"
"<|assistant|>\n" "<|assistant|>\n"
) )
...@@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.1V-9B-Thinking" model_name = "zai-org/GLM-4.1V-9B-Thinking"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000}, "size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
enforce_eager=True, enforce_eager=True,
) )
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image": if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
...@@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V" model_name = "zai-org/GLM-4.5V"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000}, "size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
enforce_eager=True, enforce_eager=True,
tensor_parallel_size=4, tensor_parallel_size=4,
) )
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image": if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
...@@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V-FP8" model_name = "zai-org/GLM-4.5V-FP8"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: ...@@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000}, "size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
enforce_eager=True, enforce_eager=True,
tensor_parallel_size=4, tensor_parallel_size=4,
) )
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image": if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
...@@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: ...@@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData: def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-OCR" model_name = "zai-org/GLM-OCR"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData: ...@@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000}, "size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
enforce_eager=True, enforce_eager=True,
) )
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image": if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
...@@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision( ...@@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision(
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192 if modality == "image" else 16384, max_model_len=16384 if modality in ("video", "image+video") else 8192,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
messages = list() messages = list()
...@@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision( ...@@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision(
} }
] ]
) )
elif modality == "image+video":
messages.append(
[
{
"role": "user",
"content": [
{
"type": "image",
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
},
{
"type": "video",
},
{
"type": "text",
"text": question,
},
],
}
]
)
else: else:
raise ValueError(f"Unsupported modality: {modality}") raise ValueError(f"Unsupported modality: {modality}")
...@@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
def run_interns1(questions: list[str], modality: str) -> ModelRequestData: def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini" model_name = "internlm/Intern-S1-mini"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
enforce_eager=True, enforce_eager=True,
) )
image_placeholder = "<IMG_CONTEXT>"
video_placeholder = "<video>"
if modality == "image": if modality == "image":
placeholder = "<IMG_CONTEXT>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<video>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [ messages = [
...@@ -909,20 +975,26 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData: ...@@ -909,20 +975,26 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData: def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-Pro" model_name = "internlm/Intern-S1-Pro"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
enforce_eager=True, enforce_eager=True,
tensor_parallel_size=4, tensor_parallel_size=4,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|vision_start|><|image_pad|><|vision_end|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|vision_start|><|video_pad|><|vision_end|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [ messages = [
...@@ -943,17 +1015,23 @@ def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData: ...@@ -943,17 +1015,23 @@ def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
def run_internvl(questions: list[str], modality: str) -> ModelRequestData: def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B" model_name = "OpenGVLab/InternVL3-2B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<image>"
video_placeholder = "<video>"
if modality == "image": if modality == "image":
placeholder = "<image>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<video>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [ messages = [
...@@ -1010,21 +1088,27 @@ def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1010,21 +1088,27 @@ def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData:
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview" model_name = "Kwai-Keye/Keye-VL-8B-Preview"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
trust_remote_code=True, trust_remote_code=True,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -1041,21 +1125,27 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1041,21 +1125,27 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData: def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-1.5-8B" model_name = "Kwai-Keye/Keye-VL-1.5-8B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
trust_remote_code=True, trust_remote_code=True,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -1259,22 +1349,26 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat ...@@ -1259,22 +1349,26 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat
# LLaVA-OneVision # LLaVA-OneVision
def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
if modality == "video": image_placeholder = "<image>"
prompts = [ video_placeholder = "<video>"
f"<|im_start|>user <video>\n{question}<|im_end|><|im_start|>assistant\n"
for question in questions if modality == "image":
] placeholder = image_placeholder
elif modality == "video":
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
elif modality == "image":
prompts = [ prompts = [
f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n" (f"<|im_start|>user {placeholder}\n{question}<|im_end|><|im_start|>assistant\n")
for question in questions for question in questions
] ]
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384, max_model_len=16384,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
return ModelRequestData( return ModelRequestData(
...@@ -1307,7 +1401,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1307,7 +1401,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
# MiniCPM-V # MiniCPM-V
def run_minicpmv_base(questions: list[str], modality: str, model_name): def run_minicpmv_base(questions: list[str], modality: str, model_name):
assert modality in ["image", "video"] assert modality in ["image", "video", "image+video"]
# If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
# 2.0 # 2.0
...@@ -1329,12 +1423,13 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): ...@@ -1329,12 +1423,13 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
# o2.6: image, video, audio # o2.6: image, video, audio
# model_name = "openbmb/MiniCPM-o-2_6" # model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
# NOTE The stop_token_ids are different for various versions of MiniCPM-V # NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0 # 2.0
...@@ -1347,17 +1442,22 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): ...@@ -1347,17 +1442,22 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
stop_tokens = ["<|im_end|>", "<|endoftext|>"] stop_tokens = ["<|im_end|>", "<|endoftext|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
modality_placeholder = { image_placeholder = "(<image>./</image>)"
"image": "(<image>./</image>)", video_placeholder = "(<video>./</video>)"
"video": "(<video>./</video>)",
} if modality == "image":
placeholder = image_placeholder
elif modality == "video":
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
prompts = [ prompts = [
tokenizer.apply_chat_template( tokenizer.apply_chat_template(
[ [
{ {
"role": "user", "role": "user",
"content": f"{modality_placeholder[modality]}\n{question}", "content": f"{placeholder}\n{question}",
} }
], ],
tokenize=False, tokenize=False,
...@@ -1466,20 +1566,24 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1466,20 +1566,24 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
def run_molmo2(questions: list[str], modality: str) -> ModelRequestData: def run_molmo2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "allenai/Molmo2-8B" model_name = "allenai/Molmo2-8B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
max_num_batched_tokens=36864, max_num_batched_tokens=36864,
) )
image_placeholder = "<|image|>"
video_placeholder = "<|video|>"
if modality == "image": if modality == "image":
placeholder = "<|image|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video|>" placeholder = video_placeholder
else: elif modality == "image+video":
raise ValueError(f"Unsupported modality for molmo2: {modality}") placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
f"{placeholder}<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n" f"{placeholder}<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
...@@ -1563,19 +1667,25 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1563,19 +1667,25 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
def run_openpangu_vl(questions: list[str], modality: str) -> ModelRequestData: def run_openpangu_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "FreedomIntelligence/openPangu-VL-7B" model_name = "FreedomIntelligence/openPangu-VL-7B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=4, max_num_seqs=4,
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True, enforce_eager=True,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "[unused19]"
video_placeholder = "[unused32]"
if modality == "image": if modality == "image":
placeholder = "[unused19]" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "[unused32]" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
...@@ -1623,18 +1733,25 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1623,18 +1733,25 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData: def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "AIDC-AI/Ovis2.5-2B" model_name = "AIDC-AI/Ovis2.5-2B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
dtype="half", dtype="half",
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<image>"
video_placeholder = "<video>"
if modality == "image": if modality == "image":
placeholder = "<image>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<video>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
prompts = [ prompts = [
f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n" f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
...@@ -1846,6 +1963,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1846,6 +1963,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -1855,18 +1973,23 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1855,18 +1973,23 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28, "min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -1883,6 +2006,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1883,6 +2006,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -1892,18 +2016,23 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1892,18 +2016,23 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -1920,6 +2049,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1920,6 +2049,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen2_5_omni(questions: list[str], modality: str): def run_qwen2_5_omni(questions: list[str], modality: str):
model_name = "Qwen/Qwen2.5-Omni-7B" model_name = "Qwen/Qwen2.5-Omni-7B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -1929,13 +2059,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ...@@ -1929,13 +2059,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_bos|><|IMAGE|><|vision_eos|>"
video_placeholder = "<|vision_bos|><|VIDEO|><|vision_eos|>"
if modality == "image": if modality == "image":
placeholder = "<|IMAGE|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|VIDEO|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
default_system = ( default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
...@@ -1946,7 +2081,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ...@@ -1946,7 +2081,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
prompts = [ prompts = [
( (
f"<|im_start|>system\n{default_system}<|im_end|>\n" f"<|im_start|>system\n{default_system}<|im_end|>\n"
f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -1962,6 +2097,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ...@@ -1962,6 +2097,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-4B-Instruct" model_name = "Qwen/Qwen3-VL-4B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -1971,18 +2107,23 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1971,18 +2107,23 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -1999,6 +2140,7 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1999,6 +2140,7 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData: def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct" model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -2008,18 +2150,23 @@ def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData: ...@@ -2008,18 +2150,23 @@ def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -2190,6 +2337,7 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: ...@@ -2190,6 +2337,7 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b" model_name = "omni-research/Tarsier2-Recap-7b"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
...@@ -2197,18 +2345,23 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -2197,18 +2345,23 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
"architectures": ["Tarsier2ForConditionalGeneration"], "architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2", "model_type": "tarsier2",
}, },
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt=mm_limit,
) )
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image": if modality == "image":
placeholder = "<|image_pad|>" placeholder = image_placeholder
elif modality == "video": elif modality == "video":
placeholder = "<|video_pad|>" placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [ prompts = [
( (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n" "<|im_start|>assistant\n"
) )
...@@ -2357,6 +2510,24 @@ def get_multi_modal_input(args): ...@@ -2357,6 +2510,24 @@ def get_multi_modal_input(args):
"questions": vision_chunk_questions, "questions": vision_chunk_questions,
} }
if args.modality == "image+video":
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
img_video_questions = [
"What is shown in the image? What happens in the video?",
"Describe both the image and the video content.",
]
return {
"data": {
"image": image,
"video": ([(video, metadata)] if needs_metadata else video),
},
"questions": img_video_questions,
}
msg = f"Modality {args.modality} is not supported." msg = f"Modality {args.modality} is not supported."
raise ValueError(msg) raise ValueError(msg)
...@@ -2439,7 +2610,7 @@ def parse_args(): ...@@ -2439,7 +2610,7 @@ def parse_args():
"--modality", "--modality",
type=str, type=str,
default="image", default="image",
choices=["image", "video", "vision_chunk"], choices=["image", "video", "image+video", "vision_chunk"],
help="Modality of the input.", help="Modality of the input.",
) )
parser.add_argument( parser.add_argument(
...@@ -2546,23 +2717,42 @@ def main(args): ...@@ -2546,23 +2717,42 @@ def main(args):
else req_data.sampling_params else req_data.sampling_params
) )
def _mm_data(data, modality):
if modality == "image+video":
return {"image": data["image"], "video": data["video"]}
return {modality: data}
def _mm_uuid(uuid, modality):
if modality == "image+video":
return {"image": uuid, "video": uuid + "v"}
return {modality: uuid}
def _mm_empty(modality):
if modality == "image+video":
return {"image": None, "video": None}
return {modality: None}
assert args.num_prompts > 0 assert args.num_prompts > 0
if args.num_prompts == 1: if args.num_prompts == 1:
# Single inference # Single inference
uuid = "uuid_0" uuid = "uuid_0"
inputs = { inputs = {
"prompt": prompts[0], "prompt": prompts[0],
"multi_modal_data": {modality: data}, "multi_modal_data": _mm_data(data, modality),
"multi_modal_uuids": {modality: uuid}, "multi_modal_uuids": _mm_uuid(uuid, modality),
} }
inputs_with_empty_media = { inputs_with_empty_media = {
"prompt": prompts[0], "prompt": prompts[0],
"multi_modal_data": {modality: None}, "multi_modal_data": _mm_empty(modality),
"multi_modal_uuids": {modality: uuid}, "multi_modal_uuids": _mm_uuid(uuid, modality),
} }
else: else:
# Batch inference # Batch inference
if args.image_repeat_prob is not None: if args.image_repeat_prob is not None:
if modality == "image+video":
raise ValueError(
"--image-repeat-prob is not supported for 'image+video' modality"
)
# Repeat images with specified probability of "image_repeat_prob" # Repeat images with specified probability of "image_repeat_prob"
inputs, inputs_with_empty_media = apply_image_repeat( inputs, inputs_with_empty_media = apply_image_repeat(
args.image_repeat_prob, args.image_repeat_prob,
...@@ -2572,7 +2762,7 @@ def main(args): ...@@ -2572,7 +2762,7 @@ def main(args):
modality, modality,
) )
else: else:
# Use the same image for all prompts # Use the same image/video for all prompts
inputs = [] inputs = []
inputs_with_empty_media = [] inputs_with_empty_media = []
for i in range(args.num_prompts): for i in range(args.num_prompts):
...@@ -2580,15 +2770,15 @@ def main(args): ...@@ -2580,15 +2770,15 @@ def main(args):
inputs.append( inputs.append(
{ {
"prompt": prompts[i % len(prompts)], "prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: data}, "multi_modal_data": _mm_data(data, modality),
"multi_modal_uuids": {modality: uuid}, "multi_modal_uuids": _mm_uuid(uuid, modality),
} }
) )
inputs_with_empty_media.append( inputs_with_empty_media.append(
{ {
"prompt": prompts[i % len(prompts)], "prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: None}, "multi_modal_data": _mm_empty(modality),
"multi_modal_uuids": {modality: uuid}, "multi_modal_uuids": _mm_uuid(uuid, modality),
} }
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment