[StepVL] add step vl offline example (#33054)

Signed-off-by: luotingdan <luotingdan@stepfun.com>
Co-authored-by: luotingdan <luotingdan@stepfun.com>

[StepVL] add step vl offline example (#33054)
Signed-off-by: luotingdan <luotingdan@stepfun.com>
Co-authored-by: luotingdan <luotingdan@stepfun.com>
b40db4df · ltd0924 · GitHub · 11b55687 · b40db4df · b40db4df
Unverified Commit b40db4df authored Jan 26, 2026 by ltd0924 Committed by GitHub Jan 26, 2026
Showing with 54 additions and 0 deletions

examples/offline_inference/vision_language.py +27 -0

examples/offline_inference/vision_language_multi_image.py +27 -0

No files found.
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1889,6 +1889,32 @@ def run_step3(questions: list[str], modality: str) -> ModelRequestData:
    )
+# StepVL10B
+def run_step_vl(questions: list[str], modality: str) -> ModelRequestData:
+    """Build the single-image request data for stepfun-ai/Step3-VL-10B.
+
+    Only the "image" modality is accepted (enforced by an assert). Every
+    entry of ``questions`` is turned into one prompt that contains a single
+    ``<im_patch>`` placeholder and ends with ``<think>\\n``.
+    """
+    assert modality == "image"
+
+    def _render(question: str) -> str:
+        # One placeholder per prompt, in line with limit_mm_per_prompt below.
+        return (
+            "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+            f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
+        )
+
+    return ModelRequestData(
+        engine_args=EngineArgs(
+            model="stepfun-ai/Step3-VL-10B",
+            max_num_batched_tokens=4096,
+            tensor_parallel_size=1,
+            trust_remote_code=True,
+            limit_mm_per_prompt={modality: 1},
+            reasoning_parser="deepseek_r1",
+        ),
+        prompts=[_render(q) for q in questions],
+    )
 # omni-research/Tarsier-7b
 def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -2006,6 +2032,7 @@ model_example_map = {
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
    "step3": run_step3,
+    "stepvl": run_step_vl,
    "tarsier": run_tarsier,
    "tarsier2": run_tarsier2,
 }

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1182,6 +1182,32 @@ def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    )
+def load_step_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build the multi-image request data for stepfun-ai/Step3-VL-10B.
+
+    The prompt holds one ``<im_patch>`` placeholder per entry of
+    ``image_urls`` followed directly by ``question``; the per-prompt image
+    limit is set to the same count. Images are loaded with ``fetch_image``.
+    """
+    image_count = len(image_urls)
+    engine_args = EngineArgs(
+        model="stepfun-ai/Step3-VL-10B",
+        max_num_batched_tokens=4096,
+        limit_mm_per_prompt={"image": image_count},
+        hf_overrides={"vision_config": {"enable_patch": False}},
+        trust_remote_code=True,
+        reasoning_parser="deepseek_r1",
+    )
+    # Placeholders are emitted back to back, ahead of the question text.
+    placeholders = "<im_patch>" * image_count
+    prompt = (
+        "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+        f"{placeholders}{question}<|EOT|><|BOT|>"
+        "assistant\n<think>\n"
+    )
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=list(map(fetch_image, image_urls)),
+    )
 def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "omni-research/Tarsier-7b"
@@ -1374,6 +1400,7 @@ model_example_map = {
    "rvl": load_r_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
+    "stepvl": load_step_vl,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_5v": load_glm4_5v,