Add tarsier model support (#18985)

Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>

Add tarsier model support (#18985)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
1282bd81 · 汪志鹏 · GitHub · bdce64f2 · 1282bd81 · 1282bd81
Unverified Commit 1282bd81 authored Jun 03, 2025 by 汪志鹏 Committed by GitHub Jun 03, 2025
7 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -550,6 +550,7 @@ Specified using `--task generate`.
 | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  |                      | ✅︎                         | ✅︎\*                     |
 | `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               |                       | ✅︎                          | ✅︎                      |
 | `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `SmolVLM2-2.2B-Instruct`                                                                                                                                | ✅︎                     |                           | ✅︎                      |
+| `TarsierForConditionalGeneration`                | Tarsier                                                                  | T + I<sup>E+</sup>                                                                                                     | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b`                                                                                                           |                      | ✅︎                      | ✅︎                   |
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.  
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:  

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -333,6 +333,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    )
+# omni-research/Tarsier-7b
+def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "omni-research/Tarsier-7b"
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL3-2B"
@@ -1091,6 +1110,7 @@ model_example_map = {
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
+    "tarsier": run_tarsier,
 }

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -691,6 +691,26 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    )
+def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "omni-research/Tarsier-7b"
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+    prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:"
+    image_data = [fetch_image(url) for url in image_urls]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
 model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
@@ -712,6 +732,7 @@ model_example_map = {
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
+    "tarsier": load_tarsier,
 }

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -282,6 +282,7 @@ def _test_processing_correctness_one(
    "Skywork/Skywork-R1V-38B",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    "openai/whisper-large-v3",
+    "omni-research/Tarsier-7b",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -406,6 +406,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                     trust_remote_code=True),
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
+                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
    # [Encoder-decoder]
    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
    # Therefore, we borrow the BartTokenizer from the original Bart model

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -211,6 +211,7 @@ _MULTIMODAL_MODELS = {
    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
+    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
    # [Encoder-decoder]
    "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
    "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501

--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py