Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c3bf9bad
Unverified
Commit
c3bf9bad
authored
Jun 21, 2025
by
汪志鹏
Committed by
GitHub
Jun 21, 2025
Browse files
[New model support]Support Tarsier2 (#19887)
Signed-off-by:
汪志鹏
<
wangzhipeng628@gmail.com
>
parent
6f170f11
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
152 additions
and
1 deletion
+152
-1
docs/models/supported_models.md
docs/models/supported_models.md
+1
-0
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+32
-0
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+27
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+1
-0
tests/models/registry.py
tests/models/registry.py
+2
-0
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+88
-1
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+1
-0
No files found.
docs/models/supported_models.md
View file @
c3bf9bad
...
@@ -562,6 +562,7 @@ Specified using `--task generate`.
...
@@ -562,6 +562,7 @@ Specified using `--task generate`.
|
`SkyworkR1VChatModel`
| Skywork-R1V-38B | T + I |
`Skywork/Skywork-R1V-38B`
| | ✅︎ | ✅︎ |
|
`SkyworkR1VChatModel`
| Skywork-R1V-38B | T + I |
`Skywork/Skywork-R1V-38B`
| | ✅︎ | ✅︎ |
|
`SmolVLMForConditionalGeneration`
| SmolVLM2 | T + I |
`SmolVLM2-2.2B-Instruct`
| ✅︎ | | ✅︎ |
|
`SmolVLMForConditionalGeneration`
| SmolVLM2 | T + I |
`SmolVLM2-2.2B-Instruct`
| ✅︎ | | ✅︎ |
|
`TarsierForConditionalGeneration`
| Tarsier | T + I
<sup>
E+
</sup>
|
`omni-research/Tarsier-7b`
,
`omni-research/Tarsier-34b`
| | ✅︎ | ✅︎ |
|
`TarsierForConditionalGeneration`
| Tarsier | T + I
<sup>
E+
</sup>
|
`omni-research/Tarsier-7b`
,
`omni-research/Tarsier-34b`
| | ✅︎ | ✅︎ |
|
`Tarsier2ForConditionalGeneration`
<sup>
^
</sup>
| Tarsier2 | T + I
<sup>
E+
</sup>
+ V
<sup>
E+
</sup>
|
`omni-research/Tarsier2-Recap-7b`
,
`omni-research/Tarsier2-7b-0115`
| | ✅︎ | ✅︎ |
<sup>
^
</sup>
You need to set the architecture name via
`--hf-overrides`
to match the one in vLLM.
<sup>
^
</sup>
You need to set the architecture name via
`--hf-overrides`
to match the one in vLLM.
• For example, to use DeepSeek-VL2 series models:
• For example, to use DeepSeek-VL2 series models:
...
...
examples/offline_inference/vision_language.py
View file @
c3bf9bad
...
@@ -1040,6 +1040,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
...
@@ -1040,6 +1040,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
)
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the Tarsier2 example.

    Args:
        questions: User questions; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. Selects the placeholder
            token inserted between the vision markers and the key used for
            ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate up front: previously an unsupported modality left
    # ``placeholder`` unbound and only failed later, while the prompts were
    # being built, with an unrelated UnboundLocalError.
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    try:
        placeholder = placeholders[modality]
    except KeyError:
        raise ValueError(
            f"Unsupported modality for Tarsier2: {modality!r} "
            "(expected 'image' or 'video')"
        ) from None

    model_name = "omni-research/Tarsier2-Recap-7b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        # The architecture name is overridden so that it matches the one
        # registered in vLLM.
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# SkyworkR1V
# SkyworkR1V
def
run_skyworkr1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
def
run_skyworkr1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
...
@@ -1112,6 +1143,7 @@ model_example_map = {
...
@@ -1112,6 +1143,7 @@ model_example_map = {
"skywork_chat"
:
run_skyworkr1v
,
"skywork_chat"
:
run_skyworkr1v
,
"smolvlm"
:
run_smolvlm
,
"smolvlm"
:
run_smolvlm
,
"tarsier"
:
run_tarsier
,
"tarsier"
:
run_tarsier
,
"tarsier2"
:
run_tarsier2
,
}
}
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
c3bf9bad
...
@@ -828,6 +828,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
...
@@ -828,6 +828,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
)
)
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image Tarsier2 request.

    Args:
        question: The user question appended after the vision section.
        image_urls: URLs of the images to fetch; one ``<|image_pad|>``
            placeholder is emitted per URL.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the single prompt
        and the fetched images.
    """
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="omni-research/Tarsier2-Recap-7b",
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": num_images},
        # The architecture name is overridden so that it matches the one
        # registered in vLLM.
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    # All image placeholders sit inside one vision_start/vision_end pair.
    image_placeholders = "<|image_pad|>" * num_images
    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|>{image_placeholders}"
        f"<|vision_end|>{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    image_data = []
    for url in image_urls:
        image_data.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
model_example_map
=
{
model_example_map
=
{
"aria"
:
load_aria
,
"aria"
:
load_aria
,
"aya_vision"
:
load_aya_vision
,
"aya_vision"
:
load_aya_vision
,
...
@@ -853,6 +879,7 @@ model_example_map = {
...
@@ -853,6 +879,7 @@ model_example_map = {
"qwen2_5_vl"
:
load_qwen2_5_vl
,
"qwen2_5_vl"
:
load_qwen2_5_vl
,
"smolvlm"
:
load_smolvlm
,
"smolvlm"
:
load_smolvlm
,
"tarsier"
:
load_tarsier
,
"tarsier"
:
load_tarsier
,
"tarsier2"
:
load_tarsier2
,
}
}
...
...
tests/models/multimodal/processing/test_common.py
View file @
c3bf9bad
...
@@ -284,6 +284,7 @@ def _test_processing_correctness_one(
...
@@ -284,6 +284,7 @@ def _test_processing_correctness_one(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"openai/whisper-large-v3"
,
"openai/whisper-large-v3"
,
"omni-research/Tarsier-7b"
,
"omni-research/Tarsier-7b"
,
"omni-research/Tarsier2-Recap-7b"
])
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
...
tests/models/registry.py
View file @
c3bf9bad
...
@@ -398,6 +398,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -398,6 +398,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
,
# noqa: E501
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"TarsierForConditionalGeneration"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"TarsierForConditionalGeneration"
]}),
# noqa: E501
"Tarsier2ForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier2-Recap-7b"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
]}),
# noqa: E501
# [Encoder-decoder]
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
# Therefore, we borrow the BartTokenizer from the original Bart model
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
c3bf9bad
...
@@ -32,12 +32,14 @@ import torch
...
@@ -32,12 +32,14 @@ import torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
einops
import
rearrange
,
repeat
from
einops
import
rearrange
,
repeat
from
transformers
import
BatchFeature
from
transformers
import
AutoConfig
,
BatchFeature
from
transformers.models.qwen2_vl
import
(
Qwen2VLImageProcessor
,
from
transformers.models.qwen2_vl
import
(
Qwen2VLImageProcessor
,
Qwen2VLProcessor
)
Qwen2VLProcessor
)
from
transformers.models.qwen2_vl.configuration_qwen2_vl
import
(
from
transformers.models.qwen2_vl.configuration_qwen2_vl
import
(
Qwen2VLConfig
,
Qwen2VLVisionConfig
)
Qwen2VLConfig
,
Qwen2VLVisionConfig
)
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
from
transformers.models.qwen2_vl.video_processing_qwen2_vl
import
(
Qwen2VLVideoProcessor
)
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
...
@@ -69,6 +71,7 @@ from vllm.sequence import IntermediateTensors
...
@@ -69,6 +71,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.transformers_utils.processor
import
(
from
vllm.transformers_utils.processor
import
(
cached_image_processor_from_config
)
cached_image_processor_from_config
)
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
)
SupportsMultiModal
,
SupportsPP
)
...
@@ -1405,3 +1408,87 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1405,3 +1408,87 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
connector
=
"visual.merger."
,
connector
=
"visual.merger."
,
tower_model
=
"visual."
,
tower_model
=
"visual."
,
)
)
class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
    """Multi-modal processor for Tarsier2.

    Adds nothing on top of ``Qwen2VLMultiModalProcessor``; it exists so that
    Tarsier2 has its own processor class to register.
    """
class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
    """``Qwen2VLImageProcessor`` that also accepts a pixel-bound ``size``."""

    def __init__(
        self,
        size: Optional[dict[str, int]] = None,
        **kwargs,
    ) -> None:
        """Initialise the processor, remapping ``size`` keys when needed.

        Args:
            size: Optional size mapping. When it contains both
                ``"min_pixels"`` and ``"max_pixels"``, those two values are
                forwarded as ``"shortest_edge"`` and ``"longest_edge"``
                respectively; any other value is forwarded unchanged.
            **kwargs: Passed through to ``Qwen2VLImageProcessor``.
        """
        has_pixel_bounds = (size is not None and "min_pixels" in size
                            and "max_pixels" in size)
        if has_pixel_bounds:
            # Remap if Tarsier2-specific format is provided
            size = {
                "shortest_edge": size["min_pixels"],
                "longest_edge": size["max_pixels"],
            }
        super().__init__(size=size, **kwargs)
class Tarsier2Processor(Qwen2VLProcessor):
    """``Qwen2VLProcessor`` assembled from a raw image-processor config."""

    def __init__(
        self,
        vision_config: dict,
        tokenizer: AnyTokenizer,
        **kwargs,
    ):
        """Build the processor from its parts.

        Args:
            vision_config: Keyword arguments used to construct the
                ``Tarsier2ImageProcessor``.
            tokenizer: Tokenizer handed to the base processor.
            **kwargs: Extra keyword arguments forwarded to
                ``Qwen2VLProcessor``.
        """
        # The image processor is attached to the instance before the base
        # initialiser runs, exactly as the original code did.
        image_processor = Tarsier2ImageProcessor(**vision_config)
        self.image_processor = image_processor

        video_processor = Qwen2VLVideoProcessor()
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            video_processor=video_processor,
            chat_template=None,
            **kwargs,
        )
class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
    """Processing info that supplies Tarsier2-specific config and processors."""

    def get_hf_config(self) -> Qwen2VLConfig:
        """Return the model's HF config rebuilt as a ``Qwen2VLConfig``.

        The config is loaded from the model path with ``AutoConfig``, dumped
        to a dict and re-materialised through ``Qwen2VLConfig.from_dict``.
        """
        # NOTE(review): this re-reads the config from the model path on each
        # call and passes no extra loading options — confirm that is intended.
        loaded = AutoConfig.from_pretrained(self.ctx.model_config.model)
        return Qwen2VLConfig.from_dict(loaded.to_dict())

    def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
        """Create a ``Tarsier2Processor`` from the image-processor config and
        the tokenizer, forwarding any extra keyword arguments."""
        image_processor_config = self.ctx.get_hf_image_processor_config()
        tokenizer = self.get_tokenizer()
        return Tarsier2Processor(
            vision_config=image_processor_config,
            tokenizer=tokenizer,
            **kwargs,
        )

    def get_image_processor(self) -> Tarsier2ImageProcessor:
        """Create a ``Tarsier2ImageProcessor`` from the image-processor
        config."""
        image_processor_config = self.ctx.get_hf_image_processor_config()
        return Tarsier2ImageProcessor(**image_processor_config)
@MULTIMODAL_REGISTRY.register_processor(
    Tarsier2MultiModalProcessor,
    info=Tarsier2ProcessingInfo,
    dummy_inputs=Qwen2VLDummyInputsBuilder,
)
class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
    """Tarsier2 model, implemented on top of the Qwen2-VL model class."""

    # Checkpoint weights prefixed "vision_tower." are loaded under "visual.".
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={"vision_tower.": "visual."})

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Swap in the nested text config, then initialise the base model.

        Note that ``vllm_config.model_config.hf_config`` is replaced in
        place, so the caller's ``vllm_config`` is modified.
        """
        # Tarsier2 uses llava as model_type, which will create a Qwen2VLConfig
        # as text_config, we need to reconstruct Qwen2VLConfig from LlavaConfig.
        model_config = vllm_config.model_config
        outer_config = model_config.hf_config
        inner_config = outer_config.text_config
        # Carry the architecture list over to the config that replaces it.
        inner_config.architectures = outer_config.architectures
        model_config.hf_config = inner_config
        super().__init__(vllm_config=vllm_config, prefix=prefix)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load ``weights`` through ``AutoWeightsLoader``, applying
        ``hf_to_vllm_mapper``, and return the loader's result."""
        return AutoWeightsLoader(self).load_weights(
            weights, mapper=self.hf_to_vllm_mapper)
vllm/model_executor/models/registry.py
View file @
c3bf9bad
...
@@ -217,6 +217,7 @@ _MULTIMODAL_MODELS = {
...
@@ -217,6 +217,7 @@ _MULTIMODAL_MODELS = {
"UltravoxModel"
:
(
"ultravox"
,
"UltravoxModel"
),
"UltravoxModel"
:
(
"ultravox"
,
"UltravoxModel"
),
"Phi4MMForCausalLM"
:
(
"phi4mm"
,
"Phi4MMForCausalLM"
),
"Phi4MMForCausalLM"
:
(
"phi4mm"
,
"Phi4MMForCausalLM"
),
"TarsierForConditionalGeneration"
:
(
"tarsier"
,
"TarsierForConditionalGeneration"
),
# noqa: E501
"TarsierForConditionalGeneration"
:
(
"tarsier"
,
"TarsierForConditionalGeneration"
),
# noqa: E501
"Tarsier2ForConditionalGeneration"
:
(
"qwen2_vl"
,
"Tarsier2ForConditionalGeneration"
),
# noqa: E501
# [Encoder-decoder]
# [Encoder-decoder]
"Florence2ForConditionalGeneration"
:
(
"florence2"
,
"Florence2ForConditionalGeneration"
),
# noqa: E501
"Florence2ForConditionalGeneration"
:
(
"florence2"
,
"Florence2ForConditionalGeneration"
),
# noqa: E501
"MllamaForConditionalGeneration"
:
(
"mllama"
,
"MllamaForConditionalGeneration"
),
# noqa: E501
"MllamaForConditionalGeneration"
:
(
"mllama"
,
"MllamaForConditionalGeneration"
),
# noqa: E501
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment