Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
583 additions
and
233 deletions
+583
-233
tests/lora/test_transfomers_model.py
tests/lora/test_transfomers_model.py
+5
-12
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+8
-4
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_enabled_custom_ops.py
+36
-0
tests/models/decoder_only/language/test_mistral.py
tests/models/decoder_only/language/test_mistral.py
+9
-32
tests/models/decoder_only/vision_language/test_interleaved.py
...s/models/decoder_only/vision_language/test_interleaved.py
+77
-0
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+87
-55
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+64
-0
tests/models/embedding/language/test_jina_reranker_v2.py
tests/models/embedding/language/test_jina_reranker_v2.py
+70
-0
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+13
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+1
-2
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+48
-53
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+1
-6
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+2
-7
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+1
-6
tests/models/multimodal/processing/test_llama4.py
tests/models/multimodal/processing/test_llama4.py
+99
-0
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+3
-13
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+3
-13
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi3v.py
+2
-7
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+2
-6
tests/models/registry.py
tests/models/registry.py
+52
-17
No files found.
tests/lora/test_transfomers_model.py
View file @
fcfc474d
...
...
@@ -4,6 +4,7 @@ import pytest
import
vllm
from
vllm.lora.request
import
LoRARequest
from
vllm.platforms
import
current_platform
from
..utils
import
create_new_process_for_each_test
,
multi_gpu_test
...
...
@@ -46,16 +47,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
mark
.
skip_v1
@
create_new_process_for_each_test
()
def
test_ilama_lora
(
ilama_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
...
...
@@ -74,7 +65,8 @@ def test_ilama_lora(ilama_lora_files):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping to avoid redundant model tests"
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
def
test_ilama_lora_tp4
(
ilama_lora_files
):
...
...
@@ -96,7 +88,8 @@ def test_ilama_lora_tp4(ilama_lora_files):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping to avoid redundant model tests"
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
def
test_ilama_lora_tp4_fully_sharded_loras
(
ilama_lora_files
):
...
...
tests/metrics/test_metrics.py
View file @
fcfc474d
...
...
@@ -252,8 +252,10 @@ def test_metric_spec_decode(
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
,
speculative_model
=
model
,
num_speculative_tokens
=
k
,
speculative_config
=
{
"model"
:
model
,
"num_speculative_tokens"
:
k
,
},
)
as
vllm_model
:
# Force log interval to be 0 to catch all metrics.
...
...
@@ -304,8 +306,10 @@ def test_metric_spec_decode_interval(
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
,
speculative_model
=
model
,
num_speculative_tokens
=
k
,
speculative_config
=
{
"model"
:
model
,
"num_speculative_tokens"
:
k
,
},
enforce_eager
=
True
,
)
...
...
tests/model_executor/test_enabled_custom_ops.py
View file @
fcfc474d
...
...
@@ -7,6 +7,10 @@ from vllm.model_executor.custom_op import CustomOp
from
vllm.model_executor.layers.activation
import
(
GeluAndMul
,
ReLUSquaredActivation
,
SiluAndMul
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
dispatch_fused_experts_func
,
dispatch_topk_func
,
torch_vllm_inplace_fused_experts
,
torch_vllm_outplace_fused_experts
,
vllm_topk_softmax
)
from
vllm.model_executor.layers.layernorm
import
(
RMSNorm
,
dispatch_cuda_rmsnorm_func
,
fused_add_rms_norm
,
rms_norm
,
rocm_aiter_fused_add_rms_norm
,
rocm_aiter_rms_norm
)
...
...
@@ -92,6 +96,38 @@ def test_enabled_ops_invalid(env: str):
RMSNorm
(
1024
).
enabled
()
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
"0"
,
"1"
])
def
test_topk_dispatch
(
use_rocm_aiter
:
str
,
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
use_rocm_aiter
)
topk_func
=
dispatch_topk_func
()
if
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
):
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
rocm_aiter_topk_softmax
)
assert
topk_func
==
rocm_aiter_topk_softmax
else
:
assert
topk_func
==
vllm_topk_softmax
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
"0"
,
"1"
])
@
pytest
.
mark
.
parametrize
(
"inplace"
,
[
True
,
False
])
def
test_fused_experts_dispatch
(
use_rocm_aiter
:
str
,
inplace
:
bool
,
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
use_rocm_aiter
)
fused_experts_func
=
dispatch_fused_experts_func
(
inplace
)
if
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
):
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
rocm_aiter_fused_experts
)
assert
fused_experts_func
==
rocm_aiter_fused_experts
elif
inplace
:
assert
fused_experts_func
==
torch_vllm_inplace_fused_experts
else
:
assert
fused_experts_func
==
torch_vllm_outplace_fused_experts
@
pytest
.
mark
.
parametrize
(
"add_residual"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
"0"
,
"1"
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter_norm"
,
[
"0"
,
"1"
])
...
...
tests/models/decoder_only/language/test_mistral.py
View file @
fcfc474d
...
...
@@ -176,15 +176,8 @@ SAMPLE_JSON_SCHEMA = {
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
# TODO(sang): Sliding window should be tested separately.
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
...
...
@@ -208,14 +201,8 @@ def test_models(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_mistral_format
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
def
test_mistral_format
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
...
...
@@ -246,11 +233,8 @@ def test_mistral_format(
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
def
test_mistral_symbolic_languages
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
def
test_mistral_symbolic_languages
(
vllm_runner
,
model
:
str
,
dtype
:
str
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_model_len
=
8192
,
...
...
@@ -268,11 +252,7 @@ def test_mistral_symbolic_languages(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_MODELS
)
# v1 can't do func calling
def
test_mistral_function_calling
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
def
test_mistral_function_calling
(
vllm_runner
,
model
:
str
,
dtype
:
str
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
,
...
...
@@ -303,11 +283,8 @@ def test_mistral_function_calling(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"guided_backend"
,
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
])
def
test_mistral_guided_decoding
(
vllm_runner
,
model
:
str
,
guided_backend
:
str
,
)
->
None
:
def
test_mistral_guided_decoding
(
vllm_runner
,
model
:
str
,
guided_backend
:
str
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
'bfloat16'
,
tokenizer_mode
=
"mistral"
)
as
vllm_model
:
...
...
tests/models/decoder_only/vision_language/test_interleaved.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
models
=
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
]
def
base_prompt
(
modalities_str
:
str
)
->
str
:
return
f
"<|im_start|>user
{
modalities_str
}
\n
Describe what you see from these items.<|im_end|><|im_start|>assistant
\n
"
# noqa: E501
INTERLEAVED_PROMPT
=
base_prompt
(
"<image><video><image>
\n
"
)
NONINTERLEAVED_PROMPT
=
base_prompt
(
"<image><image><video>
\n
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
vllm_runner
,
model
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
"""
This is a simple test to check if interleaved and non-interleaved prompts
give the same result.
"""
image_cherry
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
image_stop
=
ImageAsset
(
"stop_sign"
).
pil_image
.
convert
(
"RGB"
)
images
=
[
image_cherry
,
image_stop
]
video
=
VideoAsset
(
name
=
"sample_demo_1.mp4"
,
num_frames
=
16
).
np_ndarrays
inputs
=
[
(
[
INTERLEAVED_PROMPT
],
[
images
],
[
video
],
),
(
[
NONINTERLEAVED_PROMPT
],
[
images
],
[
video
],
),
]
with
vllm_runner
(
model
,
task
=
"generate"
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
2
},
max_model_len
=
32768
,
max_num_seqs
=
2
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs_per_case
=
[
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
,
images
=
images
,
videos
=
videos
)
for
prompts
,
images
,
videos
in
inputs
]
all_results
=
[
output
[
0
][
1
]
for
output
in
vllm_outputs_per_case
]
outputs
=
[(
total_str
,
total_str
.
find
(
"assistant
\n
"
)
+
len
(
"assistant
\n
"
))
for
total_str
in
all_results
]
prompt_lengths
=
[
prompt_len
for
_
,
prompt_len
in
outputs
]
generated_strs
=
[
total_str
[
prompt_len
:]
for
total_str
,
prompt_len
in
outputs
]
interleaved_prompt_len
,
noninterleaved_prompt_len
=
prompt_lengths
interleaved_output_str
,
noninterleaved_output_str
=
generated_strs
# The two prompts are identical except for the order of modality tokens.
assert
interleaved_prompt_len
==
noninterleaved_prompt_len
# The two generated strings should be different because of the
# interleaved modality tokens.
assert
interleaved_output_str
!=
noninterleaved_output_str
tests/models/decoder_only/vision_language/test_models.py
View file @
fcfc474d
...
...
@@ -9,9 +9,7 @@ from pathlib import PosixPath
import
os
import
pytest
from
packaging.version
import
Version
from
transformers
import
AutoModelForImageTextToText
,
AutoModelForVision2Seq
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.platforms
import
current_platform
from
vllm.utils
import
identity
...
...
@@ -38,8 +36,6 @@ REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl"
,
"h2ovl"
,
"blip2"
,
# V1 Test: not enough KV cache space in C1.
"fuyu"
,
]
...
...
@@ -128,10 +124,9 @@ VLM_TEST_SETTINGS = {
dtype
=
"bfloat16"
,
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"vLLM does not support PrefixLM attention mask"
)],
# noqa: E501
),
# TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
# once we upgraded to transformers>=4.49.0.
"qwen2_vl"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)],
"qwen2_5_vl"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
,
...
...
@@ -147,43 +142,41 @@ VLM_TEST_SETTINGS = {
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
"qwen2_5_vl"
:
VLMTestInfo
(
models
=
[
"Qwen/Qwen2.5-VL-3B-Instruct"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
,
VLMTestType
.
VIDEO
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<|vision_start|><|image_pad|><|vision_end|>"
,
# noqa: E501
video_idx_to_prompt
=
lambda
idx
:
"<|vision_start|><|video_pad|><|vision_end|>"
,
# noqa: E501
#### Extended model tests
"aria"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"rhymes-ai/Aria"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>user
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<fim_prefix><|img|><fim_suffix>
\n
"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
auto_cls
=
AutoModelForImageTextToText
,
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<vlm_image>Please describe the image shortly."
,
"cherry_blossom"
:
"<vlm_image>Please infer the season with reason."
,
# noqa: E501
}),
multi_image_prompt
=
"<vlm_image><vlm_image>Describe the two images shortly."
,
# noqa: E501
stop_str
=
[
"<|im_end|>"
],
image_size_factors
=
[(
0.10
,
0.15
)],
max_tokens
=
64
,
marks
=
[
large_gpu_mark
(
min_gb
=
64
)],
),
"aya_vision"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/aya-vision-8b"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{
img_prompt
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>What is the season?"
,
# noqa: E501
}),
multi_image_prompt
=
"<image><image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
8192
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}}
),
#### Extended model tests
# "aria": VLMTestInfo(
# models=["rhymes-ai/Aria"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
# img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
# max_model_len=4096,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# single_image_prompts=IMAGE_ASSETS.prompts({
# "stop_sign": "<vlm_image>Please describe the image shortly.",
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)],
# max_tokens=64,
# marks=[large_gpu_mark(min_gb=64)],
# ),
"blip2"
:
VLMTestInfo
(
# TODO: Change back to 2.7b once head_dim = 80 is supported
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
)],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
...
...
@@ -220,12 +213,6 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
marks
=
[
pytest
.
mark
.
skipif
(
Version
(
TRANSFORMERS_VERSION
)
>=
Version
(
"4.48"
),
reason
=
"HF model is not compatible with transformers>=4.48"
,
)
],
),
"fuyu"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)],
...
...
@@ -277,7 +264,8 @@ VLM_TEST_SETTINGS = {
"h2ovl"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-800m"
),
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-2b"
),
# TODO: Re-enable once head_dim = 80 is supported
# "h2oai/h2ovl-mississippi-2b",
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|prompt|>
{
img_prompt
}
<|end|><|answer|>"
,
# noqa: E501
...
...
@@ -338,6 +326,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
num_video_frames
=
16
,
max_model_len
=
16384
,
hf_model_kwargs
=
model_utils
.
llava_onevision_hf_model_kwargs
(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
),
# noqa: E501
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
...
...
@@ -354,6 +343,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
vid_prompt
:
f
"USER:
{
vid_prompt
}
ASSISTANT:"
,
num_video_frames
=
16
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_video_vllm_to_hf_output
,
),
...
...
@@ -366,12 +356,6 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
patch_hf_runner
=
model_utils
.
mantis_patch_hf_runner
,
marks
=
[
pytest
.
mark
.
skipif
(
Version
(
TRANSFORMERS_VERSION
)
>=
Version
(
"4.48"
),
reason
=
"HF model is not compatible with transformers>=4.48"
,
)
],
),
"minicpmv_25"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-Llama3-V-2_5"
)],
...
...
@@ -408,7 +392,7 @@ VLM_TEST_SETTINGS = {
),
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
test_type
=
(
VLMTestType
.
IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
identity
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
...
...
@@ -451,6 +435,37 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc
=
model_utils
.
qwen_vllm_to_hf_output
,
prompt_path_encoder
=
model_utils
.
qwen_prompt_path_encoder
,
),
"qwen2_vl"
:
VLMTestInfo
(
models
=
[
"Qwen/Qwen2-VL-2B-Instruct"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
,
VLMTestType
.
VIDEO
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<|vision_start|><|image_pad|><|vision_end|>"
,
# noqa: E501
video_idx_to_prompt
=
lambda
idx
:
"<|vision_start|><|video_pad|><|vision_end|>"
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
cpu_model
],
),
"skywork_r1v"
:
VLMTestInfo
(
models
=
[
"Skywork/Skywork-R1V-38B"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin▁of▁sentence|><|User|>
\n
{
img_prompt
}
<|Assistant|><think>
\n
"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>
\n
What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>
\n
What is the season?"
,
}),
multi_image_prompt
=
"<image>
\n
<image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
4096
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
skyworkr1v_patch_hf_runner
,
marks
=
[
large_gpu_mark
(
min_gb
=
80
)],
),
### Tensor parallel / multi-gpu broadcast tests
"chameleon-broadcast"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
)],
...
...
@@ -502,6 +517,7 @@ VLM_TEST_SETTINGS = {
max_model_len
=
16384
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
hf_model_kwargs
=
model_utils
.
llava_onevision_hf_model_kwargs
(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
),
# noqa: E501
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
...
@@ -523,6 +539,22 @@ VLM_TEST_SETTINGS = {
limit_mm_per_prompt
=
{
"image"
:
1
},
)],
),
"llama4"
:
VLMTestInfo
(
models
=
[
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
],
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin_of_text|><|header_start|>user<|header_end|>
\n\n
{
img_prompt
}
<|eot|><|header_start|>assistant<|header_end|>
\n\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
_
:
"<|image|>"
,
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
distributed_executor_backend
=
"mp"
,
image_size_factors
=
[(.
25
,
0.5
,
1.0
)],
hf_model_kwargs
=
{
"device_map"
:
"auto"
},
max_model_len
=
8192
,
max_num_seqs
=
4
,
dtype
=
"bfloat16"
,
auto_cls
=
AutoModelForImageTextToText
,
tensor_parallel_size
=
8
,
vllm_runner_kwargs
=
{
"gpu_memory_utilization"
:
0.8
},
marks
=
[
large_gpu_mark
(
min_gb
=
80
),
multi_gpu_marks
(
num_gpus
=
8
)],
),
}
# yapf: enable
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
fcfc474d
...
...
@@ -104,6 +104,13 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
return
hf_output_ids
,
hf_output_str
,
out_logprobs
def
llava_onevision_hf_model_kwargs
(
model
:
str
)
->
dict
:
"""Workaround to fix the sliding window issue in llava_onevision."""
config
=
AutoConfig
.
from_pretrained
(
model
)
config
.
text_config
.
sliding_window
=
None
return
config
.
to_dict
()
def
llava_onevision_vllm_to_hf_output
(
vllm_output
:
RunnerOutput
,
model
:
str
)
->
RunnerOutput
:
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
...
...
@@ -376,6 +383,63 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
def
skyworkr1v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
class
SkyworkR1VProcessor
:
"""A simple processor for SkyworkR1V."""
def
__init__
(
self
,
hf_runner
:
HfRunner
):
self
.
num_image_token
=
hf_runner
.
model
.
num_image_token
self
.
tokenizer
=
hf_runner
.
tokenizer
self
.
config
=
AutoConfig
.
from_pretrained
(
hf_runner
.
model_name
,
trust_remote_code
=
True
)
self
.
vision_config
=
self
.
config
.
vision_config
self
.
use_thumbnail
=
self
.
config
.
use_thumbnail
self
.
min_num
=
self
.
config
.
min_dynamic_patch
self
.
max_num
=
self
.
config
.
max_dynamic_patch
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Union
[
Image
,
list
[
Image
]],
**
kwargs
):
from
vllm.model_executor.models.skyworkr1v
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values_skyworkr1v
)
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
image_to_pixel_values_skyworkr1v
(
image
,
input_size
=
self
.
image_size
,
min_num
=
self
.
min_num
,
max_num
=
self
.
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
)
for
image
in
images
]
num_patches_list
=
[
pixel_value
.
shape
[
0
]
for
pixel_value
in
pixel_values
]
pixel_values
=
torch
.
cat
(
pixel_values
,
dim
=
0
)
for
num_patches
in
num_patches_list
:
context_tokens
=
IMG_CONTEXT
*
self
.
num_image_token
\
*
num_patches
image_tokens
=
IMG_START
+
context_tokens
+
IMG_END
text
=
text
.
replace
(
'<image>'
,
image_tokens
,
1
)
prompt
=
self
.
tokenizer
(
text
,
return_tensors
=
"pt"
)
prompt
.
update
({
"pixel_values"
:
pixel_values
})
return
prompt
img_context_token_id
=
hf_model
.
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
hf_model
.
model
.
img_context_token_id
=
img_context_token_id
hf_model
.
processor
=
SkyworkR1VProcessor
(
hf_model
)
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
language_model
.
get_output_embeddings
()
hf_model
.
model
.
generate
=
types
.
MethodType
(
_internvl_generate
,
hf_model
.
model
)
return
hf_model
def
internvl_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for InternVL."""
...
...
tests/models/embedding/language/test_jina_reranker_v2.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_jina_reranker_v2.py`.
"""
import
math
import
pytest
MODELS
=
[
"jinaai/jina-reranker-v2-base-multilingual"
,
# Roberta
]
TEXTS_1
=
[
"Organic skincare products for sensitive skin"
]
TEXTS_2
=
[
"Organic skincare for sensitive skin with aloe vera and chamomile."
,
"New makeup trends focus on bold colors and innovative techniques"
,
"Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille"
,
"Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken"
,
"Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla"
,
"Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras"
,
"针对敏感肌专门设计的天然有机护肤产品"
,
"新的化妆趋势注重鲜艳的颜色和创新的技巧"
,
"敏感肌のために特別に設計された天然有機スキンケア製品"
,
"新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています"
,
]
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
MODELS
)
def
model_name
(
request
):
yield
request
.
param
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_llm_1_to_1
(
vllm_runner
,
hf_runner
,
model_name
,
dtype
:
str
):
text_pair
=
[
TEXTS_1
[
0
],
TEXTS_2
[
0
]]
with
hf_runner
(
model_name
,
dtype
=
dtype
,
is_cross_encoder
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
predict
([
text_pair
]).
tolist
()
with
vllm_runner
(
model_name
,
task
=
"score"
,
dtype
=
dtype
,
max_model_len
=
None
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
score
(
text_pair
[
0
],
text_pair
[
1
])
assert
len
(
vllm_outputs
)
==
1
assert
len
(
hf_outputs
)
==
1
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_llm_1_to_N
(
vllm_runner
,
hf_runner
,
model_name
,
dtype
:
str
):
text_pairs
=
[[
TEXTS_1
[
0
],
text
]
for
text
in
TEXTS_2
]
with
hf_runner
(
model_name
,
dtype
=
dtype
,
is_cross_encoder
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
predict
(
text_pairs
).
tolist
()
with
vllm_runner
(
model_name
,
task
=
"score"
,
dtype
=
dtype
,
max_model_len
=
None
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
score
(
TEXTS_1
[
0
],
TEXTS_2
)
assert
len
(
vllm_outputs
)
==
10
assert
len
(
hf_outputs
)
==
10
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
1
],
vllm_outputs
[
1
],
rel_tol
=
0.01
)
tests/models/embedding/vision_language/test_phi3v.py
View file @
fcfc474d
...
...
@@ -3,6 +3,10 @@
import
os
import
pytest
import
torch.nn.functional
as
F
from
PIL
import
Image
from
vllm.assets.base
import
get_vllm_public_assets
from
vllm.assets.image
import
VLM_IMAGES_DIR
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
,
models_path_prefix
...
...
@@ -113,6 +117,15 @@ def test_models_image(
(
text
,
asset
.
pil_image
)
for
text
,
asset
in
zip
(
HF_IMAGE_PROMPTS
,
image_assets
)
]
# add cases for special_tokens
input_texts_images
.
append
((
"
\n
<s><|user|>
\n
<|image_1|>
\n\t
<s>"
"Represent the given image for classification<|end|>"
"
\n
<|assistant|>
\n
"
,
Image
.
open
(
get_vllm_public_assets
(
filename
=
"cherry_blossom.jpg"
,
s3_prefix
=
VLM_IMAGES_DIR
)),
))
input_texts
=
[
text
for
text
,
_
in
input_texts_images
]
input_images
=
[
image
for
_
,
image
in
input_texts_images
]
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
fcfc474d
...
...
@@ -214,7 +214,7 @@ def _run_test(
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
3
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
limit_mm_per_prompt
=
{
"image"
:
_LIMIT_IMAGE_PER_PROMPT
...
...
@@ -427,7 +427,6 @@ def test_bnb_regression(
max_model_len
=
4096
,
max_num_seqs
=
2
,
quantization
=
"bitsandbytes"
,
load_format
=
"bitsandbytes"
,
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
...
...
tests/models/multimodal/processing/test_common.py
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
copy
from
functools
import
partial
from
typing
import
Optional
,
Union
...
...
@@ -29,7 +28,7 @@ def _test_processing_correctness(
hit_rate
:
float
,
num_batches
:
int
,
simplify_rate
:
float
,
ignore_mm_keys
:
Optional
[
li
st
[
str
]]
=
None
,
ignore_mm_keys
:
Optional
[
s
e
t
[
str
]]
=
None
,
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
...
...
@@ -145,7 +144,7 @@ def _test_processing_correctness_hf(
baseline_processor
:
BaseMultiModalProcessor
,
cached_processor
:
BaseMultiModalProcessor
,
batch_idx
:
int
,
ignore_mm_keys
:
Optional
[
li
st
[
str
]]
=
None
,
ignore_mm_keys
:
Optional
[
s
e
t
[
str
]]
=
None
,
):
if
model_config
.
hf_config
.
model_type
in
(
"mllama"
,
"whisper"
,
"ultravox"
):
# For some multimodal models, tokenizer will always add bos_token
...
...
@@ -167,11 +166,12 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs
=
{},
)
assert
_inputs_equal
(
_
assert_inputs_equal
(
baseline_result
,
cached_result
,
ignore_mm_keys
,
),
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
)
baseline_tokenized_result
=
baseline_processor
.
apply
(
token_prompt
,
...
...
@@ -179,11 +179,12 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs
=
{},
)
assert
_inputs_equal
(
_
assert_inputs_equal
(
baseline_result
,
baseline_tokenized_result
,
ignore_mm_keys
,
),
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
)
cached_tokenized_result
=
cached_processor
.
apply
(
token_prompt
,
...
...
@@ -191,11 +192,12 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs
=
{},
)
assert
_inputs_equal
(
_
assert_inputs_equal
(
cached_result
,
cached_tokenized_result
,
ignore_mm_keys
,
),
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
)
def
_test_processing_correctness_mistral
(
...
...
@@ -206,7 +208,7 @@ def _test_processing_correctness_mistral(
baseline_processor
:
BaseMultiModalProcessor
,
cached_processor
:
BaseMultiModalProcessor
,
batch_idx
:
int
,
ignore_mm_keys
:
Optional
[
li
st
[
str
]]
=
None
,
ignore_mm_keys
:
Optional
[
s
e
t
[
str
]]
=
None
,
):
images
=
mm_data
.
get
(
"image"
,
[])
if
not
isinstance
(
images
,
list
):
...
...
@@ -233,16 +235,18 @@ def _test_processing_correctness_mistral(
hf_processor_mm_kwargs
=
{},
)
assert
_inputs_equal
(
_
assert_inputs_equal
(
baseline_tokenized_result
,
cached_tokenized_result
,
ignore_mm_keys
,
),
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
ignore_mm_keys
=
ignore_mm_keys
,
msg
=
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
,
)
# yapf: disable
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"rhymes-ai/Aria"
,
"CohereForAI/aya-vision-8b"
,
"Salesforce/blip2-opt-2.7b"
,
"facebook/chameleon-7b"
,
"deepseek-ai/deepseek-vl2-tiny"
,
...
...
@@ -259,21 +263,24 @@ def _test_processing_correctness_mistral(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
"mistralai/Pixtral-12B-2409"
,
"mistral-community/pixtral-12b"
,
"openbmb/MiniCPM-Llama3-V-2_5"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"allenai/Molmo-7B-D-0924"
,
"allenai/Molmo-7B-O-0924"
,
"nvidia/NVLM-D-72B"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma2-3b-ft-docci-448"
,
"mistralai/Pixtral-12B-2409"
,
"mistral-community/pixtral-12b"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Skywork/Skywork-R1V-38B"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"openai/whisper-large-v3"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma2-3b-ft-docci-448"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
...
@@ -290,7 +297,7 @@ def test_processing_correctness(
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
ignore_mm_keys
=
[
'
audio_features
'
]
ignore_mm_keys
=
{
"
audio_features
"
}
_test_processing_correctness
(
model_id
,
...
...
@@ -328,38 +335,26 @@ def test_processing_correctness_phi3v(
)
def
_inputs_equal
(
def
_assert
_inputs_equal
(
a
:
MultiModalInputs
,
b
:
MultiModalInputs
,
ignore_mm_keys
:
Optional
[
list
[
str
]]
=
None
,
*
,
ignore_mm_keys
:
Optional
[
set
[
str
]]
=
None
,
msg
:
str
=
""
,
):
return
_drop_mm_kwargs_keys
(
a
,
ignore_mm_keys
)
==
_drop_mm_kwargs_keys
(
b
,
ignore_mm_keys
)
def
_drop_mm_kwargs_keys
(
result
:
MultiModalInputs
,
ignore_mm_keys
:
Optional
[
list
[
str
]]
=
None
,
)
->
MultiModalInputs
:
"""Drop specified keys from result['mm_kwargs'].
This is mainly to avoid doing exact match of audio_features in ultravox.
Args:
result: Result to drop keys from
ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
"""
if
not
ignore_mm_keys
:
return
result
if
'mm_kwargs'
in
result
:
result
=
copy
.
deepcopy
(
result
)
mm_kwargs
=
result
[
'mm_kwargs'
]
for
key
in
ignore_mm_keys
:
mm_kwargs
.
pop
(
key
,
None
)
for
items
in
mm_kwargs
.
_items_by_modality
.
values
():
for
item
in
items
:
for
key
in
ignore_mm_keys
:
item
.
pop
(
key
,
None
)
return
result
if
ignore_mm_keys
is
None
:
ignore_mm_keys
=
set
()
if
msg
is
None
:
assert
"mm_kwargs"
in
a
and
"mm_kwargs"
in
b
else
:
assert
"mm_kwargs"
in
a
and
"mm_kwargs"
in
b
,
msg
for
key
in
ignore_mm_keys
:
a
[
"mm_kwargs"
].
pop
(
key
,
None
)
b
[
"mm_kwargs"
].
pop
(
key
,
None
)
if
msg
is
None
:
assert
a
==
b
else
:
assert
a
==
b
,
msg
tests/models/multimodal/processing/test_h2ovl.py
View file @
fcfc474d
...
...
@@ -10,7 +10,6 @@ from transformers import PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -156,11 +155,7 @@ def test_processor_override(
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
...
...
tests/models/multimodal/processing/test_idefics3.py
View file @
fcfc474d
...
...
@@ -5,7 +5,6 @@ import pytest
from
transformers
import
Idefics3Config
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -31,7 +30,7 @@ def test_processor_override(
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure
input_processor_for_idefics3
handles num_crops properly."""
"""Ensure
Idefics3MultiModalProcessor
handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
...
...
@@ -40,11 +39,7 @@ def test_processor_override(
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
fcfc474d
...
...
@@ -11,7 +11,6 @@ from transformers import PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -115,11 +114,7 @@ def test_processor_override(
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
...
...
tests/models/multimodal/processing/test_llama4.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
"""Tests for Llama4's multimodal preprocessing kwargs."""
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.transformers_utils.tokenizer
import
encode_tokens
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
@pytest.mark.parametrize(
    "model_id",
    ["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
)
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
@pytest.mark.parametrize("tokenized_prompt", [True, False])
def test_processor_override(
    image_assets: _ImageAssets,
    model_id: str,
    mm_processor_kwargs: dict,
    num_imgs: int,
    disable_mm_preprocessor_cache: bool,
    tokenized_prompt: bool,
):
    """Check the Llama4 multimodal processor output for a multi-image prompt.

    The prompt contains ``num_imgs`` image placeholders and is passed to the
    processor either as text or pre-tokenized (``tokenized_prompt``), with
    the preprocessor cache enabled or disabled.  The assertions cover the
    special-token counts, the tile separators, the placeholder offsets and
    the patch bookkeeping returned in ``mm_kwargs``.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
        disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    info = processor.info
    config = info.get_hf_config()
    tokenizer = info.get_tokenizer()
    hf_processor = info.get_hf_processor()
    vocab = tokenizer.get_vocab()

    # One "<|image|>" placeholder per requested image inside a user turn.
    prompt = (
        "<|begin_of_text|><|header_start|>user<|header_end|>"
        + "<|image|>" * num_imgs
        + "<|eot|><|header_start|>assistant<|header_end|>"
    )
    # Cycle through the available assets when num_imgs exceeds their count.
    pil_images = []
    for idx in range(num_imgs):
        pil_images.append(image_assets[(idx % len(image_assets))].pil_image)
    mm_data = {"image": pil_images}

    if tokenized_prompt:
        prompt = encode_tokens(tokenizer, prompt)

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
    mm_kwargs = processed_inputs["mm_kwargs"]
    prompt_token_ids = processed_inputs["prompt_token_ids"]

    # --- placeholder replacements -----------------------------------------
    # Each image must contribute exactly one begin/end marker and one
    # image token.
    assert prompt_token_ids.count(config.boi_token_index) == num_imgs
    assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
    image_token_id = vocab[hf_processor.image_token]
    assert prompt_token_ids.count(image_token_id) == num_imgs

    # Separators are only expected for images split into more than one tile.
    num_x_separators = 0
    num_y_separators = 0
    for tiles_y, tiles_x in mm_kwargs["aspect_ratios"]:
        if not tiles_x * tiles_y > 1:
            continue
        num_x_separators += (tiles_x - 1) * tiles_y
        num_y_separators += tiles_y

    tile_token_id = vocab[hf_processor.tile_token]
    assert prompt_token_ids.count(tile_token_id) == num_x_separators
    tile_global_token_id = vocab[hf_processor.tile_global_token]
    assert prompt_token_ids.count(tile_global_token_id) == num_y_separators

    # --- image token offsets ----------------------------------------------
    img_locs = processed_inputs["mm_placeholders"].get("image", [])
    assert len(img_locs) == num_imgs
    # Every placeholder must start at a begin-of-image token position.
    reported_offsets = [img_loc["offset"] for img_loc in img_locs]
    boi_positions = [
        pos for pos, token_id in enumerate(prompt_token_ids)
        if token_id == config.boi_token_index
    ]
    assert reported_offsets == boi_positions

    # --- patch sizes and masks --------------------------------------------
    num_image_tokens = prompt_token_ids.count(config.image_token_index)
    num_masked_patches = sum(
        img_patch.sum() for img_patch in mm_kwargs["embed_is_patch"])
    assert num_image_tokens == num_masked_patches

    patch_token_id = vocab[hf_processor.img_patch_token]
    num_patches = prompt_token_ids.count(patch_token_id)
    mm_counts = {"image": num_imgs}
    # The average patch-token count per image must stay within the
    # per-item budget reported by the processor info.
    avg_patches_per_image = num_patches / num_imgs
    max_tokens_per_image = info.get_mm_max_tokens_per_item(
        32768, mm_counts)["image"]
    assert avg_patches_per_image <= max_tokens_per_image

    num_patches_per_chunk = info.get_patch_per_chunk(config.vision_config)
    assert prompt_token_ids.count(config.image_token_index) == (
        mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk)
    assert mm_kwargs["pixel_values"].shape[0] == (
        mm_kwargs["patches_per_image"].sum())

    # The per-image mask length must match the tokenized length of the
    # image prompt that the HF processor builds for that aspect ratio.
    mask_ratio_pairs = zip(
        mm_kwargs["embed_is_patch"],
        mm_kwargs["aspect_ratios"],
    )
    for embed_is_patch, aspect_ratio in mask_ratio_pairs:
        image_prompt = hf_processor._prompt_split_image(
            aspect_ratio, num_patches_per_chunk)
        image_prompt_ids = tokenizer.encode(
            image_prompt, add_special_tokens=False)
        assert embed_is_patch.shape[0] == len(image_prompt_ids)
tests/models/multimodal/processing/test_llava_next.py
View file @
fcfc474d
...
...
@@ -10,7 +10,6 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
...utils
import
build_model_context
...
...
@@ -40,10 +39,7 @@ def test_processor_max_tokens(model_id):
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
info
=
processor
.
info
seen_aspect_ratios
=
set
[
float
]()
...
...
@@ -139,10 +135,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
(
488
,
183
),
(
2560
,
1669
)]
...
...
@@ -168,10 +161,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
seen_aspect_ratios
=
set
[
float
]()
image_sizes
=
list
[
ImageSize
]()
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
fcfc474d
...
...
@@ -10,7 +10,6 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
...utils
import
build_model_context
...
...
@@ -41,10 +40,7 @@ def test_processor_max_tokens(model_id):
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
info
=
processor
.
info
seen_aspect_ratios
=
set
[
float
]()
...
...
@@ -139,10 +135,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
(
488
,
183
),
(
2560
,
1669
)]
...
...
@@ -169,10 +162,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
seen_aspect_ratios
=
set
[
float
]()
image_sizes
=
list
[
ImageSize
]()
...
...
tests/models/multimodal/processing/test_phi3v.py
View file @
fcfc474d
...
...
@@ -3,7 +3,6 @@
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -30,7 +29,7 @@ def test_processor_override(
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure
input_processor_for_phi3v
handles num_crops properly."""
"""Ensure
Phi3VMultiModalProcessor
handles num_crops properly."""
# Avoid initializing CUDA early
from
vllm.model_executor.models.phi3v
import
_IMAGE_TOKEN_ID
...
...
@@ -39,11 +38,7 @@ def test_processor_override(
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
...
...
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
fcfc474d
...
...
@@ -3,7 +3,6 @@
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -34,11 +33,8 @@ def test_processor_override(
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
tokenizer
=
processor
.
info
.
get_tokenizer
()
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
...
...
tests/models/registry.py
View file @
fcfc474d
...
...
@@ -34,6 +34,16 @@ class _HfExamplesInfo:
The minimum version of HF Transformers that is required to run this model.
"""
max_transformers_version
:
Optional
[
str
]
=
None
"""
The maximum version of HF Transformers that this model runs on.
"""
transformers_version_reason
:
Optional
[
str
]
=
None
"""
The reason for the minimum/maximum version requirement.
"""
is_available_online
:
bool
=
True
"""
Set this to ``False`` if the name of this architecture no longer exists on
...
...
@@ -57,21 +67,28 @@ class _HfExamplesInfo:
If the installed transformers version does not meet the requirements,
perform the given action.
"""
if
self
.
min_transformers_version
is
None
:
if
(
self
.
min_transformers_version
is
None
and
self
.
max_transformers_version
is
None
):
return
current_version
=
TRANSFORMERS_VERSION
required_version
=
self
.
min_transformers_version
if
Version
(
current_version
)
<
Version
(
required_version
):
msg
=
(
f
"You have `transformers==
{
current_version
}
` installed, but "
f
"`transformers>=
{
required_version
}
` is required to run this "
"model"
)
min_version
=
self
.
min_transformers_version
max_version
=
self
.
max_transformers_version
msg
=
f
"`transformers==
{
current_version
}
` installed, but `transformers"
if
min_version
and
Version
(
current_version
)
<
Version
(
min_version
):
msg
+=
f
">=
{
min_version
}
` is required to run this model."
elif
max_version
and
Version
(
current_version
)
>
Version
(
max_version
):
msg
+=
f
"<=
{
max_version
}
` is required to run this model."
else
:
return
if
on_fail
==
"error"
:
raise
RuntimeError
(
msg
)
else
:
pytest
.
skip
(
msg
)
if
self
.
transformers_version_reason
:
msg
+=
f
" Reason:
{
self
.
transformers_version_reason
}
"
if
on_fail
==
"error"
:
raise
RuntimeError
(
msg
)
else
:
pytest
.
skip
(
msg
)
def
check_available_online
(
self
,
...
...
@@ -112,7 +129,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r7b-12-2024"
,
# noqa: E501
trust_remote_code
=
True
),
"DbrxForCausalLM"
:
_HfExamplesInfo
(
"databricks/dbrx-instruct"
),
"DeciLMForCausalLM"
:
_HfExamplesInfo
(
"
Deci/DeciLM-7B-instruct"
,
"DeciLMForCausalLM"
:
_HfExamplesInfo
(
"
nvidia/Llama-3_3-Nemotron-Super-49B-v1"
,
# noqa: E501
trust_remote_code
=
True
),
"DeepseekForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-llm-7b-chat"
),
"DeepseekV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/DeepSeek-V2-Lite-Chat"
,
# noqa: E501
...
...
@@ -159,6 +176,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"MiniCPM3ForCausalLM"
:
_HfExamplesInfo
(
"openbmb/MiniCPM3-4B"
,
trust_remote_code
=
True
),
"MiniMaxText01ForCausalLM"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-Text-01"
,
trust_remote_code
=
True
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
),
# noqa: E501
"QuantMixtralForCausalLM"
:
_HfExamplesInfo
(
"mistral-community/Mixtral-8x22B-v0.1-AWQ"
),
# noqa: E501
...
...
@@ -242,9 +261,14 @@ _CROSS_ENCODER_EXAMPLE_MODELS = {
_MULTIMODAL_EXAMPLE_MODELS
=
{
# [Decoder-only]
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
),
# noqa: E501
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereForAI/aya-vision-8b"
),
# noqa: E501
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
,
# noqa: E501
extras
=
{
"6b"
:
"Salesforce/blip2-opt-6.7b"
}),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
extras
=
{
"fork"
:
"Isotr0py/deepseek-vl2-tiny"
},
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"Gemma3ForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3-4b-it"
,
...
...
@@ -266,13 +290,22 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"LlavaNextVideoForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/LLaVA-NeXT-Video-7B-hf"
),
# noqa: E501
"LlavaOnevisionForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
),
# noqa: E501
"MantisForConditionalGeneration"
:
_HfExamplesInfo
(
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}),
# noqa: E501
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
trust_remote_code
=
True
),
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
},
# noqa: E501
trust_remote_code
=
True
),
"Mistral3ForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
,
# noqa: E501
min_transformers_version
=
"4.50"
,
# noqa: E501
extras
=
{
"fp8"
:
"nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
}),
# noqa: E501
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of private method which no longer exists."
,
# noqa: E501
extras
=
{
"olmo"
:
"allenai/Molmo-7B-O-0924"
},
# noqa: E501
trust_remote_code
=
True
),
"NVLM_D"
:
_HfExamplesInfo
(
"nvidia/NVLM-D-72B"
,
...
...
@@ -281,7 +314,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
trust_remote_code
=
True
,
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
),
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
),
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
# noqa: E501
...
...
@@ -294,6 +327,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
),
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
trust_remote_code
=
True
),
# [Encoder-decoder]
...
...
@@ -303,6 +337,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer
=
"facebook/bart-base"
,
trust_remote_code
=
True
),
# noqa: E501
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
),
# noqa: E501
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
# noqa: E501
}
...
...
@@ -318,8 +353,8 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
}
_
FALLBACK
_MODEL
=
{
"Transformers
Model
"
:
_HfExamplesInfo
(
"ArthurZ/Ilama-3.2-1B"
,
trust_remote_code
=
True
),
# noqa: E501
_
TRANSFORMERS
_MODEL
S
=
{
"Transformers
ForCausalLM
"
:
_HfExamplesInfo
(
"ArthurZ/Ilama-3.2-1B"
,
trust_remote_code
=
True
),
# noqa: E501
}
_EXAMPLE_MODELS
=
{
...
...
@@ -328,7 +363,7 @@ _EXAMPLE_MODELS = {
**
_CROSS_ENCODER_EXAMPLE_MODELS
,
**
_MULTIMODAL_EXAMPLE_MODELS
,
**
_SPECULATIVE_DECODING_EXAMPLE_MODELS
,
**
_
FALLBACK
_MODEL
,
**
_
TRANSFORMERS
_MODEL
S
,
}
...
...
Prev
1
…
7
8
9
10
11
12
13
14
15
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment