Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
27a145e8
Unverified
Commit
27a145e8
authored
Aug 01, 2025
by
Roger Wang
Committed by
GitHub
Aug 01, 2025
Browse files
[Doc] Add example for Step3-VL (#22061)
Signed-off-by:
Roger Wang
<
hey@rogerw.me
>
parent
da31f6ad
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
270 additions
and
211 deletions
+270
-211
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+157
-127
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+113
-84
No files found.
examples/offline_inference/vision_language.py
View file @
27a145e8
...
...
@@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
)
# SmolVLM2-2.2B-Instruct
def
run_smolvlm
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
enforce_eager
=
True
,
mm_processor_kwargs
=
{
"max_image_size"
:
{
"longest_edge"
:
384
},
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
(
f
"<|im_start|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Intern-S1
def
run_interns1
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"internlm/Intern-S1"
...
...
@@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Nemotron_VL
def
run_nemotron_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
assert
modality
==
"image"
placeholder
=
"<image>"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
[{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for InternVL
# model variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens
=
[
"<|endoftext|>"
,
"<|im_start|>"
,
"<|im_end|>"
,
"<|end|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
stop_token_ids
=
[
token_id
for
token_id
in
stop_token_ids
if
token_id
is
not
None
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# Keye-VL
def
run_keye_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Kwai-Keye/Keye-VL-8B-Preview"
...
...
@@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
)
def
run_llama4
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
4
,
tensor_parallel_size
=
8
,
gpu_memory_utilization
=
0.4
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
messages
=
[
[
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"image"
},
{
"type"
:
"text"
,
"text"
:
f
"
{
question
}
"
}],
}
]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
add_generation_prompt
=
True
,
tokenize
=
False
)
stop_token_ids
=
None
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# LLaVA-1.5
def
run_llava
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
...
...
@@ -857,63 +828,66 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
)
def
run_llama4
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
# Molmo
def
run_molmo
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"
meta-llama/Llama-4-Scout-17B-16E-Instruct
"
model_name
=
"
allenai/Molmo-7B-D-0924
"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
4
,
tensor_parallel_size
=
8
,
gpu_memory_utilization
=
0.4
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
messages
=
[
[
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"image"
},
{
"type"
:
"text"
,
"text"
:
f
"
{
question
}
"
}],
}
]
prompts
=
[
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
add_generation_prompt
=
True
,
tokenize
=
False
)
stop_token_ids
=
None
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# Molmo
def
run_molmo
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"allenai/Molmo-7B-D-0924"
# Nemotron_VL
def
run_nemotron_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
max_model_len
=
8192
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
assert
modality
==
"image"
placeholder
=
"<image>"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
[{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for InternVL
# model variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens
=
[
"<|endoftext|>"
,
"<|im_start|>"
,
"<|im_end|>"
,
"<|end|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
stop_token_ids
=
[
token_id
for
token_id
in
stop_token_ids
if
token_id
is
not
None
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
...
...
@@ -1274,10 +1248,11 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
#
omni-research/Tarsier-7b
def
run_
tarsier
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
#
SkyworkR1V
def
run_
skyworkr1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"omni-research/Tarsier-7b"
model_name
=
"Skywork/Skywork-R1V-38B"
engine_args
=
EngineArgs
(
model
=
model_name
,
...
...
@@ -1285,36 +1260,73 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[(
f
"USER: <image>
\n
{
question
}
ASSISTANT:"
)
for
question
in
questions
]
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
[{
"role"
:
"user"
,
"content"
:
f
"<image>
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for SkyworkR1V
# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
stop_tokens
=
[
"<|end▁of▁sentence|>"
,
"<|endoftext|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
def
run_tarsier2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"omni-research/Tarsier2-Recap-7b"
# SmolVLM2-2.2B-Instruct
def
run_smolvlm
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
]},
max_model_len
=
8192
,
max_num_seqs
=
2
,
enforce_eager
=
True
,
mm_processor_kwargs
=
{
"max_image_size"
:
{
"longest_edge"
:
384
},
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
(
f
"<|im_start|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
)
for
question
in
questions
]
if
modality
==
"image"
:
placeholder
=
"<|image_pad|>"
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Step3
def
run_step3
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"stepfun-ai/step3-fp8"
# NOTE: Below are verified configurations for step3-fp8
# on 8xH100 GPUs.
engine_args
=
EngineArgs
(
model
=
model_name
,
max_num_batched_tokens
=
4096
,
gpu_memory_utilization
=
0.85
,
tensor_parallel_size
=
8
,
limit_mm_per_prompt
=
{
modality
:
1
},
reasoning_parser
=
"step3"
,
)
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
"<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user
\n
"
f
"<im_patch>
{
question
}
<|EOT|><|BOT|>assistant
\n
<think>
\n
"
for
question
in
questions
]
...
...
@@ -1324,11 +1336,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
)
#
SkyworkR1V
def
run_
skyworkr1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
#
omni-research/Tarsier-7b
def
run_
tarsier
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"Skywork/Skywork-R1V-38B"
model_name
=
"omni-research/Tarsier-7b"
engine_args
=
EngineArgs
(
model
=
model_name
,
...
...
@@ -1336,24 +1347,42 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[(
f
"USER: <image>
\n
{
question
}
ASSISTANT:"
)
for
question
in
questions
]
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
[{
"role"
:
"user"
,
"content"
:
f
"<image>
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Stop tokens for SkyworkR1V
# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
stop_tokens
=
[
"<|end▁of▁sentence|>"
,
"<|endoftext|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
def
run_tarsier2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"omni-research/Tarsier2-Recap-7b"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
]},
limit_mm_per_prompt
=
{
modality
:
1
},
)
if
modality
==
"image"
:
placeholder
=
"<|image_pad|>"
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
...
...
@@ -1373,9 +1402,9 @@ model_example_map = {
"idefics3"
:
run_idefics3
,
"interns1"
:
run_interns1
,
"internvl_chat"
:
run_internvl
,
"nemotron_vl"
:
run_nemotron_vl
,
"keye_vl"
:
run_keye_vl
,
"kimi_vl"
:
run_kimi_vl
,
"llama4"
:
run_llama4
,
"llava"
:
run_llava
,
"llava-next"
:
run_llava_next
,
"llava-next-video"
:
run_llava_next_video
,
...
...
@@ -1385,8 +1414,8 @@ model_example_map = {
"minicpmv"
:
run_minicpmv
,
"mistral3"
:
run_mistral3
,
"mllama"
:
run_mllama
,
"llama4"
:
run_llama4
,
"molmo"
:
run_molmo
,
"nemotron_vl"
:
run_nemotron_vl
,
"NVLM_D"
:
run_nvlm_d
,
"ovis"
:
run_ovis
,
"paligemma"
:
run_paligemma
,
...
...
@@ -1401,6 +1430,7 @@ model_example_map = {
"qwen2_5_omni"
:
run_qwen2_5_omni
,
"skywork_chat"
:
run_skyworkr1v
,
"smolvlm"
:
run_smolvlm
,
"step3"
:
run_step3
,
"tarsier"
:
run_tarsier
,
"tarsier2"
:
run_tarsier2
,
}
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
27a145e8
...
...
@@ -197,36 +197,55 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_idefics3
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
def
load_hyperclovax_seed_vision
(
question
:
str
,
image_urls
:
list
[
str
]
)
->
ModelRequestData
:
model_name
=
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
16
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
max_model_len
=
16384
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
# if you are running out of memory, you can reduce the "longest_edge".
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
mm_processor_kwargs
=
{
"size"
:
{
"longest_edge"
:
2
*
364
},
},
)
placeholders
=
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
)
message
=
{
"role"
:
"user"
,
"content"
:
list
()}
for
_image_url
in
image_urls
:
message
[
"content"
].
append
(
{
"type"
:
"image"
,
"image"
:
_image_url
,
"ocr"
:
""
,
"lens_keywords"
:
""
,
"lens_local_keywords"
:
""
,
}
)
message
[
"content"
].
append
(
{
"type"
:
"text"
,
"text"
:
question
,
}
)
prompt
=
f
"<|begin_of_text|>User:
{
placeholders
}
\n
{
question
}
<end_of_utterance>
\n
Assistant:"
# noqa: E501
prompt
=
tokenizer
.
apply_chat_template
(
[
message
,
],
tokenize
=
False
,
add_generation_prompt
=
True
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
None
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
)
def
load_
smolvlm
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFace
TB/SmolVLM2-2.2B-Instruct
"
def
load_
idefics3
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFace
M4/Idefics3-8B-Llama3
"
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args
=
EngineArgs
(
...
...
@@ -235,17 +254,17 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
max_num_seqs
=
16
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
# if you are running out of memory, you can reduce the "longest_edge".
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
mm_processor_kwargs
=
{
"
max_image_
size"
:
{
"longest_edge"
:
38
4
},
"size"
:
{
"longest_edge"
:
2
*
36
4
},
},
)
placeholders
=
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
)
)
prompt
=
(
f
"<|im_start|>User:
{
placeholders
}
\n
{
question
}
<end_of_utterance>
\n
Assistant:"
# noqa: E501
)
prompt
=
f
"<|begin_of_text|>User:
{
placeholders
}
\n
{
question
}
<end_of_utterance>
\n
Assistant:"
# noqa: E501
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
...
...
@@ -316,60 +335,13 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_hyperclovax_seed_vision
(
question
:
str
,
image_urls
:
list
[
str
]
)
->
ModelRequestData
:
model_name
=
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
16384
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
message
=
{
"role"
:
"user"
,
"content"
:
list
()}
for
_image_url
in
image_urls
:
message
[
"content"
].
append
(
{
"type"
:
"image"
,
"image"
:
_image_url
,
"ocr"
:
""
,
"lens_keywords"
:
""
,
"lens_local_keywords"
:
""
,
}
)
message
[
"content"
].
append
(
{
"type"
:
"text"
,
"text"
:
question
,
}
)
prompt
=
tokenizer
.
apply_chat_template
(
[
message
,
],
tokenize
=
False
,
add_generation_prompt
=
True
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
None
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
)
def
load_llama4
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
def
load_llava
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
# NOTE: CAUTION! The original Llava models weren't really trained on multi-image inputs,
# so they will generate poor responses for multi-image inputs!
model_name
=
"llava-hf/llava-1.5-7b-hf"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_num_seqs
=
16
,
max_model_len
=
131072
,
tensor_parallel_size
=
8
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
...
...
@@ -397,11 +369,12 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_llava_next
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"llava-hf/llava-v1.6-mistral-7b-hf"
def
load_llava
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
# NOTE: CAUTION! The original Llava models weren't really trained on multi-image inputs,
# so they will generate poor responses for multi-image inputs!
model_name
=
"llava-hf/llava-1.5-7b-hf"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
16
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
...
...
@@ -430,11 +403,11 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_llava_
o
ne
vision
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"llava-hf/llava-
onevision-qwen2-7b-ov
-hf"
def
load_llava_ne
xt
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"llava-hf/llava-
v1.6-mistral-7b
-hf"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
16384
,
max_model_len
=
8192
,
max_num_seqs
=
16
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
...
...
@@ -463,13 +436,12 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
)
def
load_llama4
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
def
load_llava_onevision
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
1
31072
,
tensor_parallel_size
=
8
,
max_model_len
=
1
6384
,
max_num_seqs
=
16
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
...
...
@@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_smolvlm
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
16
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
mm_processor_kwargs
=
{
"max_image_size"
:
{
"longest_edge"
:
384
},
},
)
placeholders
=
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
)
)
prompt
=
(
f
"<|im_start|>User:
{
placeholders
}
\n
{
question
}
<end_of_utterance>
\n
Assistant:"
# noqa: E501
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
)
def
load_step3
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"stepfun-ai/step3-fp8"
# NOTE: Below are verified configurations for step3-fp8
# on 8xH100 GPUs.
engine_args
=
EngineArgs
(
model
=
model_name
,
max_num_batched_tokens
=
4096
,
gpu_memory_utilization
=
0.85
,
tensor_parallel_size
=
8
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
reasoning_parser
=
"step3"
,
)
prompt
=
(
"<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user
\n
"
f
"
{
'<im_patch>'
*
len
(
image_urls
)
}{
question
}
<|EOT|><|BOT|"
">assistant
\n
<think>
\n
"
)
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
image_data
=
image_data
,
)
def
load_tarsier
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"omni-research/Tarsier-7b"
...
...
@@ -1006,16 +1034,16 @@ model_example_map = {
"deepseek_vl_v2"
:
load_deepseek_vl2
,
"gemma3"
:
load_gemma3
,
"h2ovl_chat"
:
load_h2ovl
,
"hyperclovax_seed_vision"
:
load_hyperclovax_seed_vision
,
"idefics3"
:
load_idefics3
,
"interns1"
:
load_interns1
,
"internvl_chat"
:
load_internvl
,
"hyperclovax_seed_vision"
:
load_hyperclovax_seed_vision
,
"keye_vl"
:
load_keye_vl
,
"kimi_vl"
:
load_kimi_vl
,
"llama4"
:
load_llama4
,
"llava"
:
load_llava
,
"llava-next"
:
load_llava_next
,
"llava-onevision"
:
load_llava_onevision
,
"llama4"
:
load_llama4
,
"mistral3"
:
load_mistral3
,
"mllama"
:
load_mllama
,
"NVLM_D"
:
load_nvlm_d
,
...
...
@@ -1028,6 +1056,7 @@ model_example_map = {
"qwen2_vl"
:
load_qwen2_vl
,
"qwen2_5_vl"
:
load_qwen2_5_vl
,
"smolvlm"
:
load_smolvlm
,
"step3"
:
load_step3
,
"tarsier"
:
load_tarsier
,
"tarsier2"
:
load_tarsier2
,
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment