Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0552cfb1
Unverified
Commit
0552cfb1
authored
Oct 24, 2025
by
Yu Jiaqi
Committed by
GitHub
Oct 23, 2025
Browse files
[Model] Siglip Embedding Support (#27324)
Signed-off-by:
piood
<
2477084691@qq.com
>
parent
51dd14ac
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
870 additions
and
86 deletions
+870
-86
docs/models/supported_models.md
docs/models/supported_models.md
+7
-6
examples/offline_inference/vision_language_pooling.py
examples/offline_inference/vision_language_pooling.py
+49
-24
examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
...ng/pooling/openai_chat_embedding_client_for_multimodal.py
+78
-36
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/pooling/test_siglip.py
+137
-0
tests/models/registry.py
tests/models/registry.py
+1
-0
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+1
-0
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+594
-18
vllm/transformers_utils/chat_templates/registry.py
vllm/transformers_utils/chat_templates/registry.py
+3
-2
No files found.
docs/models/supported_models.md
View file @
0552cfb1
...
...
@@ -800,12 +800,13 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
The following table lists those that are tested in vLLM.
| Architecture | Models | Inputs | Example HF Models |
[
LoRA
](
../features/lora.md
)
|
[
PP
](
../serving/parallelism_scaling.md
)
|
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
`CLIPModel`
| CLIP | T / I |
`openai/clip-vit-base-patch32`
,
`openai/clip-vit-large-patch14`
, etc. | | |
|
`LlavaNextForConditionalGeneration`
<sup>
C
</sup>
| LLaVA-NeXT-based | T / I |
`royokong/e5-v`
| | ✅︎ |
|
`Phi3VForCausalLM`
<sup>
C
</sup>
| Phi-3-Vision-based | T + I |
`TIGER-Lab/VLM2Vec-Full`
| | ✅︎ |
|
`*ForConditionalGeneration`
<sup>
C
</sup>
,
`*ForCausalLM`
<sup>
C
</sup>
, etc. | Generative models |
\*
| N/A |
\*
|
\*
|
| Architecture | Models | Inputs | Example HF Models |
[
LoRA
](
../features/lora.md
)
|
[
PP
](
../serving/parallelism_scaling.md
)
|
[
V1
](
gh-issue:8779
)
|
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
`CLIPModel`
| CLIP | T / I |
`openai/clip-vit-base-patch32`
,
`openai/clip-vit-large-patch14`
, etc. | | | ✅︎ |
|
`LlavaNextForConditionalGeneration`
<sup>
C
</sup>
| LLaVA-NeXT-based | T / I |
`royokong/e5-v`
| | ✅︎ | ✅︎ |
|
`Phi3VForCausalLM`
<sup>
C
</sup>
| Phi-3-Vision-based | T + I |
`TIGER-Lab/VLM2Vec-Full`
| | ✅︎ | ✅︎ |
|
`SiglipModel`
| SigLIP | T / I |
`google/siglip-base-patch16-224`
| | | ✅︎ |
|
`*ForConditionalGeneration`
<sup>
C
</sup>
,
`*ForCausalLM`
<sup>
C
</sup>
, etc. | Generative models |
\*
| N/A |
\*
|
\*
|
\*
|
<sup>
C
</sup>
Automatically converted into an embedding model via
`--convert embed`
. (
[
details
](
./pooling_models.md#model-conversion
)
)
\*
Feature support is the same as that of the original model.
...
...
examples/offline_inference/vision_language_pooling.py
View file @
0552cfb1
...
...
@@ -110,6 +110,53 @@ def run_e5_v(query: Query) -> ModelRequestData:
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    """Build the request data for the ``jinaai/jina-reranker-m0`` example.

    Args:
        query: Query mapping; its ``"modality"`` must be ``"text+images"``.
            ``query["text"]`` is forwarded as the request's ``query`` and
            ``query["image"]`` as its ``documents``.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments together with
        the text query and the image documents.

    Raises:
        ValueError: If the query modality is anything but ``"text+images"``.
    """
    modality = query["modality"]
    if modality != "text+images":
        raise ValueError(f"Unsupported query modality: '{modality}'")

    # Pixel bounds handed to the multimodal processor.
    processor_kwargs = {"min_pixels": 3136, "max_pixels": 602112}

    return ModelRequestData(
        engine_args=EngineArgs(
            model="jinaai/jina-reranker-m0",
            runner="pooling",
            max_model_len=32768,
            trust_remote_code=True,
            mm_processor_kwargs=processor_kwargs,
            limit_mm_per_prompt={"image": 1},
        ),
        query=query["text"],
        documents=query["image"],
    )
def run_siglip(query: Query) -> ModelRequestData:
    """Build the request data for the ``google/siglip-base-patch16-224`` example.

    A query is either text-only or image-only:

    * ``"text"``: the prompt is ``query["text"]`` and no image is attached.
    * ``"image"``: the image is ``query["image"]`` and the prompt is empty.

    Args:
        query: Query mapping with a ``"modality"`` key plus the matching
            ``"text"`` or ``"image"`` payload.

    Returns:
        A ``ModelRequestData`` with the engine arguments, prompt and image.

    Raises:
        ValueError: If the modality is neither ``"text"`` nor ``"image"``.
    """
    modality = query["modality"]
    if modality == "text":
        prompt, image = query["text"], None
    elif modality == "image":
        # For image input, make sure that the prompt text is empty
        prompt, image = "", query["image"]
    else:
        raise ValueError(f"Unsupported query modality: '{modality}'")

    return ModelRequestData(
        engine_args=EngineArgs(
            model="google/siglip-base-patch16-224",
            runner="pooling",
            limit_mm_per_prompt={"image": 1},
        ),
        prompt=prompt,
        image=image,
    )
def
_get_vlm2vec_prompt_image
(
query
:
Query
,
image_token
:
str
):
if
query
[
"modality"
]
==
"text"
:
text
=
query
[
"text"
]
...
...
@@ -211,29 +258,6 @@ def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    """Prepare the ``jinaai/jina-reranker-m0`` scoring request.

    Only ``"text+images"`` queries are accepted. The text part becomes the
    request's ``query`` and the image part becomes its ``documents``.

    Raises:
        ValueError: If ``query["modality"]`` is not ``"text+images"``.
    """
    requested = query["modality"]
    if not requested == "text+images":
        raise ValueError(f"Unsupported query modality: '{requested}'")

    reranker_args = dict(
        model="jinaai/jina-reranker-m0",
        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs={"min_pixels": 3136, "max_pixels": 602112},
        limit_mm_per_prompt={"image": 1},
    )
    engine_args = EngineArgs(**reranker_args)

    return ModelRequestData(
        engine_args=engine_args,
        query=query["text"],
        documents=query["image"],
    )
def
get_query
(
modality
:
QueryModality
):
if
modality
==
"text"
:
return
TextQuery
(
modality
=
"text"
,
text
=
"A dog sitting in the grass"
)
...
...
@@ -328,9 +352,10 @@ def run_score(model: str, modality: QueryModality, seed: int | None):
# Maps each example key to its `run_*` function.
# Each key is listed exactly once: a repeated key in a dict display is
# silently overwritten, so the second "jinavl_reranker" entry was dead weight.
model_example_map = {
    "clip": run_clip,
    "e5_v": run_e5_v,
    "jinavl_reranker": run_jinavl_reranker,
    "siglip": run_siglip,
    "vlm2vec_phi3v": run_vlm2vec_phi3v,
    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
...
...
examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
View file @
0552cfb1
...
...
@@ -83,25 +83,29 @@ def run_clip(client: OpenAI, model: str):
print
(
"Text embedding output:"
,
response
.
data
[
0
].
embedding
)
def
run_
vlm2vec
(
client
:
OpenAI
,
model
:
str
):
def
run_
dse_qwen2_vl
(
client
:
OpenAI
,
model
:
str
):
"""
Start the server using:
vllm serve
TIGER-Lab/VLM2Vec-Full
\
vllm serve
MrLight/dse-qwen2-2b-mrl-v1
\
--runner pooling
\
--trust-remote-code
\
--max-model-len
4096
\
--chat-template examples/template_
vlm2vec_phi3v
.jinja
--max-model-len
8192
\
--chat-template examples/template_
dse_qwen2_vl
.jinja
"""
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
{
"type"
:
"text"
,
"text"
:
"Represent the given image."
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
,
},
},
{
"type"
:
"text"
,
"text"
:
"What is shown in this image?"
},
],
}
],
...
...
@@ -111,17 +115,26 @@ def run_vlm2vec(client: OpenAI, model: str):
print
(
"Image embedding output:"
,
response
.
data
[
0
].
embedding
)
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer
=
io
.
BytesIO
()
image_placeholder
=
Image
.
new
(
"RGB"
,
(
56
,
56
))
image_placeholder
.
save
(
buffer
,
"png"
)
buffer
.
seek
(
0
)
image_placeholder
=
base64
.
b64encode
(
buffer
.
read
()).
decode
(
"utf-8"
)
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
{
"type"
:
"text"
,
"text"
:
"Represent the given image with the following question: What is in the image."
,
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"data:image/jpeg;base64,
{
image_placeholder
}
"
,
},
},
{
"type"
:
"text"
,
"text"
:
"Query: What is the weather like today?"
},
],
}
],
...
...
@@ -129,7 +142,16 @@ def run_vlm2vec(client: OpenAI, model: str):
encoding_format
=
"float"
,
)
print
(
"Image+Text embedding output:"
,
response
.
data
[
0
].
embedding
)
print
(
"Text embedding output:"
,
response
.
data
[
0
].
embedding
)
def
run_siglip
(
client
:
OpenAI
,
model
:
str
):
"""
Start the server using:
vllm serve google/siglip-base-patch16-224
\
--runner pooling
"""
response
=
create_chat_embeddings
(
client
,
...
...
@@ -137,7 +159,23 @@ def run_vlm2vec(client: OpenAI, model: str):
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"A cat and a dog"
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
],
}
],
model
=
model
,
encoding_format
=
"float"
,
)
print
(
"Image embedding output:"
,
response
.
data
[
0
].
embedding
)
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"a photo of a cat"
},
],
}
],
...
...
@@ -148,29 +186,25 @@ def run_vlm2vec(client: OpenAI, model: str):
print
(
"Text embedding output:"
,
response
.
data
[
0
].
embedding
)
def
run_
dse_qwen2_vl
(
client
:
OpenAI
,
model
:
str
):
def
run_
vlm2vec
(
client
:
OpenAI
,
model
:
str
):
"""
Start the server using:
vllm serve
MrLight/dse-qwen2-2b-mrl-v1
\
vllm serve
TIGER-Lab/VLM2Vec-Full
\
--runner pooling
\
--trust-remote-code
\
--max-model-len
8192
\
--chat-template examples/template_
dse_qwen2_vl
.jinja
--max-model-len
4096
\
--chat-template examples/template_
vlm2vec_phi3v
.jinja
"""
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
,
},
},
{
"type"
:
"text"
,
"text"
:
"What is shown in this image?"
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
{
"type"
:
"text"
,
"text"
:
"Represent the given image."
},
],
}
],
...
...
@@ -180,26 +214,33 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
print
(
"Image embedding output:"
,
response
.
data
[
0
].
embedding
)
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer
=
io
.
BytesIO
()
image_placeholder
=
Image
.
new
(
"RGB"
,
(
56
,
56
))
image_placeholder
.
save
(
buffer
,
"png"
)
buffer
.
seek
(
0
)
image_placeholder
=
base64
.
b64encode
(
buffer
.
read
()).
decode
(
"utf-8"
)
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"data:image/jpeg;base64,
{
image_placeholder
}
"
,
},
"type"
:
"text"
,
"text"
:
"Represent the given image with the following question: What is in the image."
,
},
{
"type"
:
"text"
,
"text"
:
"Query: What is the weather like today?"
},
],
}
],
model
=
model
,
encoding_format
=
"float"
,
)
print
(
"Image+Text embedding output:"
,
response
.
data
[
0
].
embedding
)
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"A cat and a dog"
},
],
}
],
...
...
@@ -212,8 +253,9 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
# Maps each example key to its `run_*` client function.
# "vlm2vec" used to appear twice with the same value; the redundant entry is
# dropped. Key order matches what the original literal evaluated to.
model_example_map = {
    "clip": run_clip,
    "vlm2vec": run_vlm2vec,
    "dse_qwen2_vl": run_dse_qwen2_vl,
    "siglip": run_siglip,
}
...
...
tests/models/multimodal/pooling/test_siglip.py
0 → 100644
View file @
0552cfb1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
SiglipModel
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
...utils
import
check_embeddings_close
# Text-only prompts used for the text embedding comparison.
HF_TEXT_PROMPTS = ["a photo of a stop sign", "a photo of a cherry blossom"]

# Image prompts: every asset is paired with an empty prompt string.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(dict(stop_sign="", cherry_blossom=""))

# Checkpoints exercised by the tests in this module.
MODELS = [
    "google/siglip-base-patch16-224",
]
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Compare vLLM embeddings with the HF ``SiglipModel`` reference.

    vLLM embeds all inputs first and its runner is closed before the HF
    runner is opened. On the HF side each input is embedded on its own:
    inputs containing ``pixel_values`` go through ``get_image_features``,
    everything else through ``get_text_features``.

    Args:
        hf_runner: Factory for the HF reference runner.
        vllm_runner: Factory for the vLLM runner.
        input_texts: Text prompts, one per request.
        input_images: Images passed alongside the prompts.
        model: Name of the model to load in both runners.
        dtype: dtype string forwarded to both runners.
    """
    vllm_options = dict(
        runner="pooling",
        dtype=dtype,
        enforce_eager=True,
        max_model_len=64,
    )
    with vllm_runner(model, **vllm_options) as vllm_model:
        vllm_embeddings = vllm_model.embed(input_texts, images=input_images)

    def _reference_embedding(hf_model, raw_inputs):
        # Pick the tower from what the processed inputs contain.
        batch = hf_model.wrap_device(raw_inputs)
        if "pixel_values" not in batch:
            features = hf_model.model.get_text_features(
                input_ids=batch.input_ids,
            )
        else:
            features = hf_model.model.get_image_features(
                pixel_values=batch.pixel_values,
            )
        return features.squeeze(0).tolist()

    with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
        hf_embeddings = [
            _reference_embedding(hf_model, raw_inputs)
            for raw_inputs in hf_model.get_inputs(input_texts, images=input_images)
        ]

    check_embeddings_close(
        embeddings_0_lst=hf_embeddings,
        embeddings_1_lst=vllm_embeddings,
        name_0="hf",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Check text-only embeddings: every prompt is sent with no image."""
    input_texts = list(HF_TEXT_PROMPTS)
    # One `None` per prompt marks the request as having no image.
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Check image embeddings: each image prompt is paired with its asset."""
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [prompt for prompt, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_models_text_image_no_crash(
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Check that a text-plus-image request is rejected cleanly.

    The combined request must raise ``ValueError`` matching ``"not both"``.
    A text-only request and an image-only request are then issued on the
    same model instance.
    """
    first_text = HF_TEXT_PROMPTS[0]
    first_image = image_assets[0].pil_image
    texts, images = [first_text], [first_image]

    runner_options = dict(
        runner="pooling",
        dtype=dtype,
        enforce_eager=True,
        max_model_len=64,
    )
    with vllm_runner(model, **runner_options) as vllm_model:
        with pytest.raises(ValueError, match="not both"):
            vllm_model.embed(texts, images=images)

        # Follow-up requests on the same instance after the rejected one.
        vllm_model.embed(texts)
        vllm_model.embed([""], images=images)
tests/models/registry.py
View file @
0552cfb1
...
...
@@ -471,6 +471,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
"TIGER-Lab/VLM2Vec-Full"
,
trust_remote_code
=
True
),
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"MrLight/dse-qwen2-2b-mrl-v1"
),
"SiglipModel"
:
_HfExamplesInfo
(
"google/siglip-base-patch16-224"
),
"PrithviGeoSpatialMAE"
:
_HfExamplesInfo
(
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
,
dtype
=
"float16"
,
...
...
vllm/model_executor/models/registry.py
View file @
0552cfb1
...
...
@@ -209,6 +209,7 @@ _EMBEDDING_MODELS = {
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
"SiglipModel"
:
(
"siglip"
,
"SiglipEmbeddingModel"
),
# Technically Terratorch models work on images, both in
# input and output. I am adding it here because it piggy-backs on embedding
# models for the time being.
...
...
vllm/model_executor/models/siglip.py
View file @
0552cfb1
This diff is collapsed.
Click to expand it.
vllm/transformers_utils/chat_templates/registry.py
View file @
0552cfb1
...
...
@@ -31,14 +31,15 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Path |
# Fallback chat template per model type. A value is either a template path
# under CHAT_TEMPLATES_DIR or a callable.
# Each key is listed exactly once: "clip" and "deepseek_vl_v2" used to appear
# twice with identical values, and a repeated key in a dict display is
# silently overwritten. Key order matches what the original literal
# evaluated to.
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
    "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
    "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
    "deepseek_ocr": CHAT_TEMPLATES_DIR / "template_deepseek_ocr.jinja",
    "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
    "minicpmv": _get_minicpmv_chat_template_fallback,
    "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "qwen": _get_qwen_chat_template_fallback,
    "siglip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment