Unverified commit 27a145e8, authored by Roger Wang, committed by GitHub
Browse files

[Doc] Add example for Step3-VL (#22061)


Signed-off-by: Roger Wang <hey@rogerw.me>
parent da31f6ad
...@@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
) )
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for SmolVLM2-2.2B-Instruct.

    Args:
        questions: Questions to ask; one prompt is rendered per question.
        modality: Must be ``"image"`` (enforced with an assertion).

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.
    """
    assert modality == "image"

    # Processor kwargs passed through to the multimodal processor.
    processor_kwargs = {
        "max_image_size": {"longest_edge": 384},
    }
    args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={modality: 1},
    )

    # Each prompt carries exactly one <image> placeholder ahead of the question.
    rendered = []
    for q in questions:
        rendered.append(f"<|im_start|>User:<image>{q}<end_of_utterance>\nAssistant:")

    return ModelRequestData(engine_args=args, prompts=rendered)
# Intern-S1 # Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData: def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1" model_name = "internlm/Intern-S1"
...@@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
) )
# Nemotron_VL
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for Llama-3.1-Nemotron-Nano-VL-8B-V1.

    Args:
        questions: Questions to ask; one chat-templated prompt is rendered
            per question.
        modality: Must be ``"image"`` (enforced with an assertion).

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompts and
        the stop token ids resolved through the model's tokenizer.
    """
    checkpoint = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
    args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )
    assert modality == "image"

    image_tag = "<image>"
    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

    # One single-turn conversation per question, image placeholder first.
    conversations = []
    for q in questions:
        conversations.append([{"role": "user", "content": f"{image_tag}\n{q}"}])
    rendered = tok.apply_chat_template(
        conversations, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens taken from the InternVL conversation template; model
    # variants may use different stop tokens, so check the model card for
    # the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    # NOTE(review): the reference above is for InternVL -- confirm it also
    # matches this Nemotron checkpoint.
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    candidate_ids = map(tok.convert_tokens_to_ids, stop_words)
    # Keep only the ids the tokenizer did not report as None.
    stop_ids = [tid for tid in candidate_ids if tid is not None]

    return ModelRequestData(
        engine_args=args,
        prompts=rendered,
        stop_token_ids=stop_ids,
    )
# Keye-VL # Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview" model_name = "Kwai-Keye/Keye-VL-8B-Preview"
...@@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
) )
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for Llama-4-Scout-17B-16E-Instruct.

    Args:
        questions: Questions to ask; one chat-templated prompt is rendered
            per question.
        modality: Must be ``"image"`` (enforced with an assertion).

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts,
        with ``stop_token_ids`` explicitly set to ``None``.
    """
    assert modality == "image"

    checkpoint = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
    args = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={modality: 1},
    )
    tok = AutoTokenizer.from_pretrained(checkpoint)

    def _single_turn(q):
        # A user turn made of one image part followed by the question text.
        parts = [{"type": "image"}, {"type": "text", "text": f"{q}"}]
        return [{"role": "user", "content": parts}]

    conversations = [_single_turn(q) for q in questions]
    rendered = tok.apply_chat_template(
        conversations, add_generation_prompt=True, tokenize=False
    )

    return ModelRequestData(
        engine_args=args,
        prompts=rendered,
        stop_token_ids=None,
    )
# LLaVA-1.5 # LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData: def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -857,63 +828,66 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ...@@ -857,63 +828,66 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
) )
def run_llama4(questions: list[str], modality: str) -> ModelRequestData: # Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" model_name = "allenai/Molmo-7B-D-0924"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, trust_remote_code=True,
max_num_seqs=4, dtype="bfloat16",
tensor_parallel_size=8,
gpu_memory_utilization=0.4,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) prompts = [
messages = [ f"<|im_start|>user <image>\n{question}<|im_end|> \
[ <|im_start|>assistant\n"
{
"role": "user",
"content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
}
]
for question in questions for question in questions
] ]
prompts = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False
)
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompts=prompts, prompts=prompts,
stop_token_ids=stop_token_ids,
) )
# Molmo # Nemontron_VL
def run_molmo(questions: list[str], modality: str) -> ModelRequestData: def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
model_name = "allenai/Molmo-7B-D-0924"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", max_model_len=8192,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [ assert modality == "image"
f"<|im_start|>user <image>\n{question}<|im_end|> \ placeholder = "<image>"
<|im_start|>assistant\n"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
[{"role": "user", "content": f"{placeholder}\n{question}"}]
for question in questions for question in questions
] ]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompts=prompts, prompts=prompts,
stop_token_ids=stop_token_ids,
) )
...@@ -1274,10 +1248,11 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ...@@ -1274,10 +1248,11 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
) )
# omni-research/Tarsier-7b # SkyworkR1V
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "omni-research/Tarsier-7b"
model_name = "Skywork/Skywork-R1V-38B"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
...@@ -1285,36 +1260,73 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1285,36 +1260,73 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
[{"role": "user", "content": f"<image>\n{question}"}] for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Stop tokens for SkyworkR1V
# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompts=prompts, prompts=prompts,
stop_token_ids=stop_token_ids,
) )
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: # SmolVLM2-2.2B-Instruct
model_name = "omni-research/Tarsier2-Recap-7b" def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=8192,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, max_num_seqs=2,
enforce_eager=True,
mm_processor_kwargs={
"max_image_size": {"longest_edge": 384},
},
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
for question in questions
]
if modality == "image": return ModelRequestData(
placeholder = "<|image_pad|>" engine_args=engine_args,
elif modality == "video": prompts=prompts,
placeholder = "<|video_pad|>" )
# Step3
def run_step3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "stepfun-ai/step3-fp8"
# NOTE: Below are verified configurations for step3-fp8
# on 8xH100 GPUs.
engine_args = EngineArgs(
model=model_name,
max_num_batched_tokens=4096,
gpu_memory_utilization=0.85,
tensor_parallel_size=8,
limit_mm_per_prompt={modality: 1},
reasoning_parser="step3",
)
prompts = [ prompts = [
( "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions for question in questions
] ]
...@@ -1324,11 +1336,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1324,11 +1336,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
) )
# SkyworkR1V # omni-research/Tarsier-7b
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "omni-research/Tarsier-7b"
model_name = "Skywork/Skywork-R1V-38B"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
...@@ -1336,24 +1347,42 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1336,24 +1347,42 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) return ModelRequestData(
messages = [ engine_args=engine_args,
[{"role": "user", "content": f"<image>\n{question}"}] for question in questions prompts=prompts,
]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
) )
# Stop tokens for SkyworkR1V
# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] model_name = "omni-research/Tarsier2-Recap-7b"
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompts=prompts, prompts=prompts,
stop_token_ids=stop_token_ids,
) )
...@@ -1373,9 +1402,9 @@ model_example_map = { ...@@ -1373,9 +1402,9 @@ model_example_map = {
"idefics3": run_idefics3, "idefics3": run_idefics3,
"interns1": run_interns1, "interns1": run_interns1,
"internvl_chat": run_internvl, "internvl_chat": run_internvl,
"nemotron_vl": run_nemotron_vl,
"keye_vl": run_keye_vl, "keye_vl": run_keye_vl,
"kimi_vl": run_kimi_vl, "kimi_vl": run_kimi_vl,
"llama4": run_llama4,
"llava": run_llava, "llava": run_llava,
"llava-next": run_llava_next, "llava-next": run_llava_next,
"llava-next-video": run_llava_next_video, "llava-next-video": run_llava_next_video,
...@@ -1385,8 +1414,8 @@ model_example_map = { ...@@ -1385,8 +1414,8 @@ model_example_map = {
"minicpmv": run_minicpmv, "minicpmv": run_minicpmv,
"mistral3": run_mistral3, "mistral3": run_mistral3,
"mllama": run_mllama, "mllama": run_mllama,
"llama4": run_llama4,
"molmo": run_molmo, "molmo": run_molmo,
"nemotron_vl": run_nemotron_vl,
"NVLM_D": run_nvlm_d, "NVLM_D": run_nvlm_d,
"ovis": run_ovis, "ovis": run_ovis,
"paligemma": run_paligemma, "paligemma": run_paligemma,
...@@ -1401,6 +1430,7 @@ model_example_map = { ...@@ -1401,6 +1430,7 @@ model_example_map = {
"qwen2_5_omni": run_qwen2_5_omni, "qwen2_5_omni": run_qwen2_5_omni,
"skywork_chat": run_skyworkr1v, "skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm, "smolvlm": run_smolvlm,
"step3": run_step3,
"tarsier": run_tarsier, "tarsier": run_tarsier,
"tarsier2": run_tarsier2, "tarsier2": run_tarsier2,
} }
......
...@@ -197,36 +197,55 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -197,36 +197,55 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: def load_hyperclovax_seed_vision(
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" question: str, image_urls: list[str]
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, trust_remote_code=True,
max_num_seqs=16, max_model_len=16384,
enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
# if you are running out of memory, you can reduce the "longest_edge".
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
mm_processor_kwargs={
"size": {"longest_edge": 2 * 364},
},
) )
placeholders = "\n".join( message = {"role": "user", "content": list()}
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) for _image_url in image_urls:
message["content"].append(
{
"type": "image",
"image": _image_url,
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
}
)
message["content"].append(
{
"type": "text",
"text": question,
}
) )
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
prompt = tokenizer.apply_chat_template(
[
message,
],
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
) )
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( engine_args = EngineArgs(
...@@ -235,17 +254,17 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -235,17 +254,17 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
max_num_seqs=16, max_num_seqs=16,
enforce_eager=True, enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
# if you are running out of memory, you can reduce the "longest_edge".
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
mm_processor_kwargs={ mm_processor_kwargs={
"max_image_size": {"longest_edge": 384}, "size": {"longest_edge": 2 * 364},
}, },
) )
placeholders = "\n".join( placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
) )
prompt = ( prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
prompt=prompt, prompt=prompt,
...@@ -316,60 +335,13 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -316,60 +335,13 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_hyperclovax_seed_vision( def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
question: str, image_urls: list[str] model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=16384,
limit_mm_per_prompt={"image": len(image_urls)},
)
message = {"role": "user", "content": list()}
for _image_url in image_urls:
message["content"].append(
{
"type": "image",
"image": _image_url,
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
}
)
message["content"].append(
{
"type": "text",
"text": question,
}
)
prompt = tokenizer.apply_chat_template(
[
message,
],
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
)
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
# it will generate poor response for multi-image inputs!
model_name = "llava-hf/llava-1.5-7b-hf"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_num_seqs=16, max_model_len=131072,
tensor_parallel_size=8,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -397,11 +369,12 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -397,11 +369,12 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData: def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "llava-hf/llava-v1.6-mistral-7b-hf" # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
# it will generate poor response for multi-image inputs!
model_name = "llava-hf/llava-1.5-7b-hf"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192,
max_num_seqs=16, max_num_seqs=16,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -430,11 +403,11 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -430,11 +403,11 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData: def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf" model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=16384, max_model_len=8192,
max_num_seqs=16, max_num_seqs=16,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -463,13 +436,12 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa ...@@ -463,13 +436,12 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
) )
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=131072, max_model_len=16384,
tensor_parallel_size=8, max_num_seqs=16,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for SmolVLM2-2.2B-Instruct.

    Args:
        question: The question asked about the images.
        image_urls: URLs of the images; each one is fetched with
            ``fetch_image`` and gets its own numbered ``<image>`` placeholder.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt and
        the fetched image data.
    """
    image_count = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": image_count},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    # Placeholders are numbered from 1: "Image-1: <image>", "Image-2: <image>", ...
    image_tags = [f"Image-{idx}: <image>\n" for idx in range(1, image_count + 1)]
    joined_tags = "\n".join(image_tags)
    text = f"<|im_start|>User:{joined_tags}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=args,
        prompt=text,
        image_data=images,
    )
def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for step3-fp8.

    Args:
        question: The question asked about the images.
        image_urls: URLs of the images; each one is fetched with
            ``fetch_image`` and contributes one ``<im_patch>`` placeholder.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt and
        the fetched image data.
    """
    image_count = len(image_urls)

    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    args = EngineArgs(
        model="stepfun-ai/step3-fp8",
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": image_count},
        reasoning_parser="step3",
    )

    # One <im_patch> placeholder per image, placed directly before the question.
    image_slots = "<im_patch>" * image_count
    text = (
        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
        f"{image_slots}{question} <|EOT|><|BOT|>assistant\n<think>\n"
    )

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=args,
        prompt=text,
        image_data=images,
    )
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier-7b" model_name = "omni-research/Tarsier-7b"
...@@ -1006,16 +1034,16 @@ model_example_map = { ...@@ -1006,16 +1034,16 @@ model_example_map = {
"deepseek_vl_v2": load_deepseek_vl2, "deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3, "gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl, "h2ovl_chat": load_h2ovl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"interns1": load_interns1, "interns1": load_interns1,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"keye_vl": load_keye_vl, "keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl, "kimi_vl": load_kimi_vl,
"llama4": load_llama4,
"llava": load_llava, "llava": load_llava,
"llava-next": load_llava_next, "llava-next": load_llava_next,
"llava-onevision": load_llava_onevision, "llava-onevision": load_llava_onevision,
"llama4": load_llama4,
"mistral3": load_mistral3, "mistral3": load_mistral3,
"mllama": load_mllama, "mllama": load_mllama,
"NVLM_D": load_nvlm_d, "NVLM_D": load_nvlm_d,
...@@ -1028,6 +1056,7 @@ model_example_map = { ...@@ -1028,6 +1056,7 @@ model_example_map = {
"qwen2_vl": load_qwen2_vl, "qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl, "qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm, "smolvlm": load_smolvlm,
"step3": load_step3,
"tarsier": load_tarsier, "tarsier": load_tarsier,
"tarsier2": load_tarsier2, "tarsier2": load_tarsier2,
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment