Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2b52805
Commit
d2b52805
authored
Sep 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori
parents
9a521c23
5438967f
Changes
501
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
491 additions
and
245 deletions
+491
-245
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
...s/models/multimodal/generation/vlm_utils/custom_inputs.py
+4
-7
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+58
-2
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+33
-11
tests/models/multimodal/processing/test_glm4_1v.py
tests/models/multimodal/processing/test_glm4_1v.py
+2
-1
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+2
-1
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+2
-1
tests/models/multimodal/processing/test_llama4.py
tests/models/multimodal/processing/test_llama4.py
+5
-5
tests/models/multimodal/processing/test_mllama.py
tests/models/multimodal/processing/test_mllama.py
+3
-3
tests/models/multimodal/processing/test_mllama4.py
tests/models/multimodal/processing/test_mllama4.py
+5
-5
tests/models/multimodal/processing/test_nemotron_vl.py
tests/models/multimodal/processing/test_nemotron_vl.py
+2
-1
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+2
-1
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+61
-86
tests/models/quantization/test_fp8.py
tests/models/quantization/test_fp8.py
+1
-4
tests/models/registry.py
tests/models/registry.py
+56
-29
tests/models/test_initialization.py
tests/models/test_initialization.py
+2
-5
tests/models/test_registry.py
tests/models/test_registry.py
+3
-0
tests/models/utils.py
tests/models/utils.py
+18
-10
tests/multimodal/test_cache.py
tests/multimodal/test_cache.py
+179
-12
tests/multimodal/test_hasher.py
tests/multimodal/test_hasher.py
+4
-3
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+49
-58
No files found.
Too many changes to show.
To preserve performance only
501 of 501+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
"""Custom input builders for edge-cases in different models."""
from
io
import
BytesIO
from
typing
import
Callable
from
typing
import
Callable
import
requests
from
vllm.assets.image
import
ImageAsset
from
PIL
import
Image
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
from
vllm.multimodal.video
import
(
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
sample_frames_from_video
)
...
@@ -118,9 +115,9 @@ def different_patch_input_cases_internvl():
...
@@ -118,9 +115,9 @@ def different_patch_input_cases_internvl():
def
windows_attention_image_qwen2_5_vl
():
def
windows_attention_image_qwen2_5_vl
():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
image
_url
=
"https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
#
image
from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image
=
Image
.
open
(
BytesIO
(
requests
.
get
(
image_url
).
content
))
image
=
Image
Asset
(
"hato"
).
pil_image
question
=
"Describe the image."
question
=
"Describe the image."
img_prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
img_prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
d2b52805
...
@@ -10,6 +10,7 @@ from typing import Optional, Union
...
@@ -10,6 +10,7 @@ from typing import Optional, Union
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
import
PIL.Image
import
pytest
import
pytest
import
regex
as
re
import
regex
as
re
import
torch
import
torch
...
@@ -19,7 +20,6 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
...
@@ -19,7 +20,6 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
from
transformers.video_utils
import
VideoMetadata
from
transformers.video_utils
import
VideoMetadata
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
is_list_of
from
vllm.utils
import
is_list_of
from
.....conftest
import
HfRunner
,
ImageAsset
,
ImageTestAssets
from
.....conftest
import
HfRunner
,
ImageAsset
,
ImageTestAssets
...
@@ -343,7 +343,6 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -343,7 +343,6 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def
glm4v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
glm4v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
hf_processor
=
hf_model
.
processor
hf_processor
=
hf_model
.
processor
patch_padding_side
(
hf_processor
)
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
**
kwargs
):
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
**
kwargs
):
if
images
is
None
:
if
images
is
None
:
...
@@ -812,6 +811,63 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -812,6 +811,63 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
return
hf_model
def
ovis2_5_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
llm
.
get_output_embeddings
()
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
videos
=
None
,
**
kwargs
):
if
images
is
None
:
images
=
[]
else
:
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
if
videos
is
None
:
videos
=
[]
else
:
videos
=
[
videos
]
if
isinstance
(
videos
,
np
.
ndarray
)
else
videos
videos
=
[[
PIL
.
Image
.
fromarray
(
frame
)
for
frame
in
vid
]
for
vid
in
videos
]
prompt_start_and_end
=
{
"qwen2"
:
(
"<|im_start|>user
\n
"
,
"<|im_end|>
\n
"
),
"llama"
:
(
"<|start_header_id|>user<|end_header_id|>
\n\n
"
,
"<|eot_id|>"
),
"gemma2"
:
(
"<start_of_turn>user
\n
"
,
"<end_of_turn>
\n
"
),
}
for
start
,
end
in
prompt_start_and_end
.
values
():
if
start
in
text
and
end
in
text
:
text
=
text
.
split
(
start
)[
1
].
split
(
end
)[
0
]
break
images_message
=
[{
"type"
:
"image"
,
"image"
:
img
}
for
img
in
images
]
videos_message
=
[{
"type"
:
"video"
,
"video"
:
vid
}
for
vid
in
videos
]
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
images_message
,
*
videos_message
,
{
"type"
:
"text"
,
"text"
:
text
},
],
}]
input_ids
,
pixel_values
,
grid_thws
=
hf_model
.
model
.
preprocess_inputs
(
messages
=
messages
,
enable_thinking
=
True
)
inputs
=
{
"inputs"
:
input_ids
,
"pixel_values"
:
pixel_values
,
"grid_thws"
:
grid_thws
,
}
return
BatchFeature
(
data
=
inputs
,
tensor_type
=
"pt"
)
hf_model
.
processor
=
processor
return
hf_model
def
qwen2_5_omni_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
qwen2_5_omni_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
"""Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
thinker
=
hf_model
.
model
.
thinker
thinker
=
hf_model
.
model
.
thinker
...
...
tests/models/multimodal/processing/test_common.py
View file @
d2b52805
...
@@ -14,8 +14,9 @@ from PIL import Image
...
@@ -14,8 +14,9 @@ from PIL import Image
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
InputProcessingContext
from
vllm.inputs
import
InputProcessingContext
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
from
vllm.multimodal.cache
import
MultiModalProcessorOnlyCache
from
vllm.multimodal.inputs
import
MultiModalInputs
from
vllm.multimodal.inputs
import
MultiModalInputs
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
ProcessingCache
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
MistralTokenizer
,
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
MistralTokenizer
,
cached_tokenizer_from_config
,
cached_tokenizer_from_config
,
encode_tokens
)
encode_tokens
)
...
@@ -63,6 +64,8 @@ def _test_processing_correctness(
...
@@ -63,6 +64,8 @@ def _test_processing_correctness(
revision
=
model_info
.
revision
,
revision
=
model_info
.
revision
,
trust_remote_code
=
model_info
.
trust_remote_code
,
trust_remote_code
=
model_info
.
trust_remote_code
,
hf_overrides
=
model_info
.
hf_overrides
,
hf_overrides
=
model_info
.
hf_overrides
,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb
=
2048
,
)
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
...
@@ -71,8 +74,7 @@ def _test_processing_correctness(
...
@@ -71,8 +74,7 @@ def _test_processing_correctness(
model_config
,
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
)
)
# Ensure that it can fit all of the data
cache
=
MultiModalProcessorOnlyCache
(
model_config
)
cache
=
ProcessingCache
(
capacity_gb
=
2048
)
processing_info
=
factories
.
info
(
ctx
)
processing_info
=
factories
.
info
(
ctx
)
supported_mm_limits
=
processing_info
.
get_supported_mm_limits
()
supported_mm_limits
=
processing_info
.
get_supported_mm_limits
()
...
@@ -102,7 +104,7 @@ def _test_processing_correctness(
...
@@ -102,7 +104,7 @@ def _test_processing_correctness(
partial
(
random_video
,
partial
(
random_video
,
rng
,
rng
,
min_frames
=
2
,
min_frames
=
2
,
max_frames
=
8
,
max_frames
=
16
,
min_wh
=
128
,
min_wh
=
128
,
max_wh
=
256
),
max_wh
=
256
),
"audio"
:
"audio"
:
...
@@ -160,8 +162,10 @@ def _test_processing_correctness(
...
@@ -160,8 +162,10 @@ def _test_processing_correctness(
# incorrect token ids. So we need use `add_special_tokens=False` here
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES
=
{
_ADD_SPECIAL_TOKENS_OVERRIDES
=
{
"donut"
:
False
,
"mllama"
:
False
,
"mllama"
:
False
,
"ovis"
:
False
,
"ovis"
:
False
,
"ovis2_5"
:
False
,
"paligemma"
:
False
,
"paligemma"
:
False
,
"ultravox"
:
False
,
"ultravox"
:
False
,
"whisper"
:
False
,
"whisper"
:
False
,
...
@@ -267,23 +271,30 @@ def _test_processing_correctness_one(
...
@@ -267,23 +271,30 @@ def _test_processing_correctness_one(
"CohereForAI/aya-vision-8b"
,
"CohereForAI/aya-vision-8b"
,
"Salesforce/blip2-opt-2.7b"
,
"Salesforce/blip2-opt-2.7b"
,
"facebook/chameleon-7b"
,
"facebook/chameleon-7b"
,
"CohereLabs/command-a-vision-07-2025"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"naver-clova-ix/donut-base-finetuned-docvqa"
,
"baidu/ERNIE-4.5-VL-28B-A3B-PT"
,
"microsoft/Florence-2-base"
,
"microsoft/Florence-2-base"
,
"adept/fuyu-8b"
,
"adept/fuyu-8b"
,
"google/gemma-3-4b-it"
,
"google/gemma-3-4b-it"
,
"google/gemma-3n-E2B-it"
,
"google/gemma-3n-E2B-it"
,
"zai-org/glm-4v-9b"
,
"zai-org/glm-4v-9b"
,
"zai-org/GLM-4.1V-9B-Thinking"
,
"zai-org/GLM-4.1V-9B-Thinking"
,
"zai-org/GLM-4.5V"
,
"ibm-granite/granite-speech-3.3-2b"
,
"ibm-granite/granite-speech-3.3-2b"
,
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-800m"
,
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"internlm/Intern-S1"
,
"internlm/Intern-S1"
,
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL3-1B"
,
"OpenGVLab/InternVL3-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"OpenGVLab/InternVL3_5-1B"
,
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"
,
"OpenGVLab/InternVL3_5-30B-A3B"
,
"Kwai-Keye/Keye-VL-8B-Preview"
,
"moonshotai/Kimi-VL-A3B-Instruct"
,
"moonshotai/Kimi-VL-A3B-Instruct"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
...
@@ -301,6 +312,7 @@ def _test_processing_correctness_one(
...
@@ -301,6 +312,7 @@ def _test_processing_correctness_one(
"AIDC-AI/Ovis1.6-Gemma2-9B"
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
,
"AIDC-AI/Ovis2-1B"
,
"AIDC-AI/Ovis2-1B"
,
"AIDC-AI/Ovis2.5-2B"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma-3b-mix-224"
,
"google/paligemma2-3b-ft-docci-448"
,
"google/paligemma2-3b-ft-docci-448"
,
"microsoft/Phi-3.5-vision-instruct"
,
"microsoft/Phi-3.5-vision-instruct"
,
...
@@ -312,11 +324,15 @@ def _test_processing_correctness_one(
...
@@ -312,11 +324,15 @@ def _test_processing_correctness_one(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Qwen/Qwen2.5-Omni-3B"
,
"Qwen/Qwen2.5-Omni-3B"
,
"YannQi/R-4B"
,
"Skywork/Skywork-R1V-38B"
,
"Skywork/Skywork-R1V-38B"
,
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
"stepfun-ai/step3"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"openai/whisper-large-v3"
,
"openai/whisper-large-v3"
,
"omni-research/Tarsier-7b"
,
"omni-research/Tarsier-7b"
,
"omni-research/Tarsier2-Recap-7b"
,
"omni-research/Tarsier2-Recap-7b"
,
"mistralai/Voxtral-Mini-3B-2507"
,
])
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
@@ -370,10 +386,16 @@ def _assert_inputs_equal(
...
@@ -370,10 +386,16 @@ def _assert_inputs_equal(
if
ignore_mm_keys
is
None
:
if
ignore_mm_keys
is
None
:
ignore_mm_keys
=
set
()
ignore_mm_keys
=
set
()
assert
"mm_kwargs"
in
a
and
"mm_kwargs"
in
b
,
msg
a_rest
=
{
k
:
v
for
k
,
v
in
a
.
items
()
if
k
!=
"mm_kwargs"
}
b_rest
=
{
k
:
v
for
k
,
v
in
b
.
items
()
if
k
!=
"mm_kwargs"
}
assert
a_rest
==
b_rest
,
msg
a_data
=
a
[
"mm_kwargs"
].
get_data
()
b_data
=
b
[
"mm_kwargs"
].
get_data
()
for
key
in
ignore_mm_keys
:
for
key
in
ignore_mm_keys
:
a
[
"mm_kwargs"
]
.
pop
(
key
,
None
)
a
_data
.
pop
(
key
,
None
)
b
[
"mm_kwargs"
]
.
pop
(
key
,
None
)
b
_data
.
pop
(
key
,
None
)
assert
a
==
b
,
msg
assert
a
_data
==
b_data
,
msg
tests/models/multimodal/processing/test_glm4_1v.py
View file @
d2b52805
...
@@ -45,7 +45,8 @@ def test_processor_override(
...
@@ -45,7 +45,8 @@ def test_processor_override(
video_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
video_token
)
video_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
video_token
)
video_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
video_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
video_token_id
)
video_token_id
)
grid_t
,
_
,
_
=
processed_inputs
[
"mm_kwargs"
][
"video_grid_thw"
][
0
]
grid_t
,
_
,
_
=
processed_inputs
[
"mm_kwargs"
].
get_data
(
)[
"video_grid_thw"
][
0
]
assert
grid_t
==
expected_grid_t
assert
grid_t
==
expected_grid_t
assert
video_tok_count
==
expected_toks_per_frame
*
grid_t
assert
video_tok_count
==
expected_toks_per_frame
*
grid_t
tests/models/multimodal/processing/test_h2ovl.py
View file @
d2b52805
...
@@ -108,7 +108,8 @@ def _run_check(
...
@@ -108,7 +108,8 @@ def _run_check(
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
pixel_shape
=
processed_inputs
[
"mm_kwargs"
].
get_data
(
)[
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
d2b52805
...
@@ -68,7 +68,8 @@ def _run_check(
...
@@ -68,7 +68,8 @@ def _run_check(
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
pixel_shape
=
processed_inputs
[
"mm_kwargs"
].
get_data
(
)[
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
...
...
tests/models/multimodal/processing/test_llama4.py
View file @
d2b52805
...
@@ -51,14 +51,14 @@ def test_processor_override(
...
@@ -51,14 +51,14 @@ def test_processor_override(
prompt
=
encode_tokens
(
tokenizer
,
prompt
)
prompt
=
encode_tokens
(
tokenizer
,
prompt
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
mm_
kwargs
=
processed_inputs
[
"mm_kwargs"
]
mm_
data
=
processed_inputs
[
"mm_kwargs"
]
.
get_data
()
# place holder replacements
# place holder replacements
prompt_token_ids
=
processed_inputs
[
"prompt_token_ids"
]
prompt_token_ids
=
processed_inputs
[
"prompt_token_ids"
]
assert
prompt_token_ids
.
count
(
config
.
boi_token_index
)
==
num_imgs
assert
prompt_token_ids
.
count
(
config
.
boi_token_index
)
==
num_imgs
assert
prompt_token_ids
.
count
(
config
.
eoi_token_index
)
==
num_imgs
assert
prompt_token_ids
.
count
(
config
.
eoi_token_index
)
==
num_imgs
assert
prompt_token_ids
.
count
(
vocab
[
hf_processor
.
image_token
])
==
num_imgs
assert
prompt_token_ids
.
count
(
vocab
[
hf_processor
.
image_token
])
==
num_imgs
aspect_ratios
=
mm_
kwargs
[
"aspect_ratios"
]
aspect_ratios
=
mm_
data
[
"aspect_ratios"
]
num_x_separators
=
num_y_separators
=
0
num_x_separators
=
num_y_separators
=
0
for
tiles_y
,
tiles_x
in
aspect_ratios
:
for
tiles_y
,
tiles_x
in
aspect_ratios
:
if
tiles_x
*
tiles_y
>
1
:
if
tiles_x
*
tiles_y
>
1
:
...
@@ -80,6 +80,6 @@ def test_processor_override(
...
@@ -80,6 +80,6 @@ def test_processor_override(
num_patches_per_chunk
=
processor
.
info
.
get_patch_per_chunk
(
num_patches_per_chunk
=
processor
.
info
.
get_patch_per_chunk
(
config
.
vision_config
)
config
.
vision_config
)
assert
prompt_token_ids
.
count
(
config
.
image_token_index
)
\
assert
prompt_token_ids
.
count
(
config
.
image_token_index
)
\
==
mm_kwargs
[
"patches_per_image"
]
.
sum
(
)
*
num_patches_per_chunk
==
sum
(
mm_data
[
"patches_per_image"
])
*
num_patches_per_chunk
assert
mm_kwargs
[
"pixel_values"
]
.
shape
[
0
]
\
assert
len
(
mm_data
[
"pixel_values"
]
)
\
==
mm_kwargs
[
"patches_per_image"
]
.
sum
(
)
==
sum
(
mm_data
[
"patches_per_image"
])
tests/models/multimodal/processing/test_mllama.py
View file @
d2b52805
...
@@ -49,18 +49,18 @@ def test_profiling(
...
@@ -49,18 +49,18 @@ def test_profiling(
encoder_seq_lens
=
[
len
(
dummy_encoder_data
.
prompt_token_ids
)
encoder_seq_lens
=
[
len
(
dummy_encoder_data
.
prompt_token_ids
)
]
*
max_num_seqs
]
*
max_num_seqs
mm_
kwargs
=
processor
.
apply
(
mm_
data
=
processor
.
apply
(
prompt
=
dummy_mm_data
.
prompt
,
prompt
=
dummy_mm_data
.
prompt
,
mm_data
=
dummy_mm_data
.
mm_data
,
mm_data
=
dummy_mm_data
.
mm_data
,
hf_processor_mm_kwargs
=
dict
(),
hf_processor_mm_kwargs
=
dict
(),
)[
"mm_kwargs"
]
)[
"mm_kwargs"
]
.
get_data
()
# Get the actual number of encoder tokens for each sample.
# Get the actual number of encoder tokens for each sample.
# Because attn_metadata.encoder_seq_lens only counts the last
# Because attn_metadata.encoder_seq_lens only counts the last
# group of images for each sample, which is used to cheat the
# group of images for each sample, which is used to cheat the
# block manager to allocate blocks for those images only.
# block manager to allocate blocks for those images only.
# See MllamaMultiModalProcessor for more details.
# See MllamaMultiModalProcessor for more details.
num_tiles
=
[[
t
]
for
t
in
mm_
kwargs
.
pop
(
"num_tiles"
)]
num_tiles
=
[[
t
]
for
t
in
mm_
data
.
pop
(
"num_tiles"
)]
num_tokens_per_tile
=
calc_token_per_chunk
(
image_size
)
num_tokens_per_tile
=
calc_token_per_chunk
(
image_size
)
actual_encoder_seq_lens
=
[
actual_encoder_seq_lens
=
[
sum
(
num_tile
)
*
num_tokens_per_tile
for
num_tile
in
num_tiles
sum
(
num_tile
)
*
num_tokens_per_tile
for
num_tile
in
num_tiles
...
...
tests/models/multimodal/processing/test_mllama4.py
View file @
d2b52805
...
@@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int):
...
@@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int):
hf_config
=
ctx
.
get_hf_config
(
Llama4Config
)
hf_config
=
ctx
.
get_hf_config
(
Llama4Config
)
mm_
kwargs
=
processor
.
apply
(
mm_
data
=
processor
.
apply
(
prompt
=
dummy_mm_data
.
prompt
,
prompt
=
dummy_mm_data
.
prompt
,
mm_data
=
dummy_mm_data
.
mm_data
,
mm_data
=
dummy_mm_data
.
mm_data
,
hf_processor_mm_kwargs
=
dict
(),
hf_processor_mm_kwargs
=
dict
(),
)[
"mm_kwargs"
]
)[
"mm_kwargs"
]
.
get_data
()
image_size
=
hf_config
.
vision_config
.
image_size
image_size
=
hf_config
.
vision_config
.
image_size
patch_size
=
hf_config
.
vision_config
.
patch_size
patch_size
=
hf_config
.
vision_config
.
patch_size
downsample_ratio
=
int
(
downsample_ratio
=
int
(
round
(
1.0
/
(
hf_config
.
vision_config
.
pixel_shuffle_ratio
**
2
)))
round
(
1.0
/
(
hf_config
.
vision_config
.
pixel_shuffle_ratio
**
2
)))
tokens_per_patch
=
((
image_size
//
patch_size
)
**
2
)
//
downsample_ratio
tokens_per_patch
=
((
image_size
//
patch_size
)
**
2
)
//
downsample_ratio
chunks_per_image
=
prod
(
mm_
kwargs
[
"patches_per_image"
])
chunks_per_image
=
prod
(
mm_
data
[
"patches_per_image"
])
total_num_patches
=
chunks_per_image
*
tokens_per_patch
total_num_patches
=
chunks_per_image
*
tokens_per_patch
num_tiles
=
mm_
kwargs
[
"aspect_ratios"
][
0
][
0
]
*
mm_
kwargs
[
"aspect_ratios"
][
num_tiles
=
mm_
data
[
"aspect_ratios"
][
0
][
0
]
*
mm_
data
[
"aspect_ratios"
][
0
][
0
][
1
]
# x-y seperator tokens
1
]
# x-y seperator tokens
total_tokens
=
total_num_patches
.
item
()
+
num_tiles
.
item
(
total_tokens
=
total_num_patches
.
item
()
+
num_tiles
.
item
(
)
+
3
# image start, image, image end
)
+
3
# image start, image, image end
...
...
tests/models/multimodal/processing/test_nemotron_vl.py
View file @
d2b52805
...
@@ -70,7 +70,8 @@ def _run_check(
...
@@ -70,7 +70,8 @@ def _run_check(
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<image>"
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<image>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
pixel_shape
=
processed_inputs
[
"mm_kwargs"
].
get_data
(
)[
"pixel_values_flat"
].
shape
print
(
"Image token count:"
,
img_tok_count
,
"Pixel shape:"
,
pixel_shape
)
print
(
"Image token count:"
,
img_tok_count
,
"Pixel shape:"
,
pixel_shape
)
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
...
...
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
d2b52805
...
@@ -48,7 +48,8 @@ def test_processor_override(
...
@@ -48,7 +48,8 @@ def test_processor_override(
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
image_token
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
image_token
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values"
].
shape
pixel_shape
=
processed_inputs
[
"mm_kwargs"
].
get_data
(
)[
"pixel_values"
].
shape
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_pixels_shape
[
0
]
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_pixels_shape
[
0
]
*
num_imgs
...
...
tests/models/multimodal/test_tensor_schema.py
→
tests/models/multimodal/
processing/
test_tensor_schema.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
tempfile
from
collections.abc
import
Iterable
from
collections.abc
import
Iterable
from
contextlib
import
contextmanager
from
functools
import
partial
from
functools
import
partial
from
typing
import
Any
,
Union
from
typing
import
Any
,
Union
from
unittest.mock
import
patch
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
torch.nn
as
nn
from
mistral_common.protocol.instruct.messages
import
(
ImageChunk
,
TextChunk
,
from
mistral_common.protocol.instruct.messages
import
(
ImageChunk
,
TextChunk
,
UserMessage
)
UserMessage
)
from
mistral_common.protocol.instruct.request
import
ChatCompletionRequest
from
mistral_common.protocol.instruct.request
import
ChatCompletionRequest
from
PIL
import
Image
from
PIL
import
Image
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.engine.llm_engine
import
LLMEngine
as
V0LLMEngine
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.inputs
import
InputProcessingContext
from
vllm.inputs
import
InputProcessingContext
from
vllm.m
ultimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.m
odel_executor.model_loader.utils
import
set_default_torch_dtype
M
ulti
M
odal
Kwargs
)
from
vllm.m
ulti
m
odal
import
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.utils
import
group_mm_kwargs_by_modality
from
vllm.multimodal.utils
import
group_mm_kwargs_by_modality
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
vllm.utils
import
GiB_bytes
,
is_list_of
,
set_default_torch_num_threads
from
vllm.utils
import
is_list_of
from
vllm.v1.core.kv_cache_utils
import
get_kv_cache_config
from
vllm.v1.engine.core
import
EngineCore
as
V1EngineCore
from
...conftest
import
VllmRunner
from
...registry
import
_MULTIMODAL_EXAMPLE_MODELS
,
HF_EXAMPLE_MODELS
from
..registry
import
_MULTIMODAL_EXAMPLE_MODELS
,
HF_EXAMPLE_MODELS
from
...utils
import
dummy_hf_overrides
from
..utils
import
dummy_hf_overrides
ARCH_TO_SKIP
=
{
ARCH_TO_SKIP
=
{
"MolmoForCausalLM"
:
"incompatible requirements"
,
"MolmoForCausalLM"
:
"incompatible requirements"
,
"MiniMaxVL01ForConditionalGeneration"
:
"broken model"
,
}
}
ARCH_NEEDS_EXTRAS
=
[
ARCH_NEEDS_EXTRAS
=
[
"InternVLChatModel"
,
"InternVLChatModel"
,
...
@@ -39,7 +39,12 @@ ARCH_NEEDS_EXTRAS = [
...
@@ -39,7 +39,12 @@ ARCH_NEEDS_EXTRAS = [
"MiniCPMV"
,
"MiniCPMV"
,
"PaliGemmaForConditionalGeneration"
,
"PaliGemmaForConditionalGeneration"
,
]
]
REPO_ID_TO_SKIP
=
{
"nm-testing/pixtral-12b-FP8-dynamic"
:
"duplicated test"
}
REPO_ID_TO_SKIP
=
{
"nm-testing/pixtral-12b-FP8-dynamic"
:
"duplicated test"
,
# FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model
# after support PP for GPT-OSS
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"
:
"Broken model"
,
}
ImageInput
=
list
[
Image
.
Image
]
ImageInput
=
list
[
Image
.
Image
]
VideoInput
=
Union
[
list
[
Image
.
Image
],
list
[
np
.
ndarray
],
VideoInput
=
Union
[
list
[
Image
.
Image
],
list
[
np
.
ndarray
],
...
@@ -128,11 +133,32 @@ def create_batched_mm_kwargs(
...
@@ -128,11 +133,32 @@ def create_batched_mm_kwargs(
)[
"mm_kwargs"
]
)[
"mm_kwargs"
]
items
=
[
items
=
[
item
for
modality
in
supported_mm_limits
item
for
modality
in
supported_mm_limits
for
item
in
mm_kwargs
.
get_items
(
modality
)
for
item
in
mm_kwargs
[
modality
]
]
]
return
group_mm_kwargs_by_modality
(
items
)
return
group_mm_kwargs_by_modality
(
items
)
@
contextmanager
def
initialize_dummy_model
(
model_cls
:
nn
.
Module
,
model_config
:
ModelConfig
):
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
tensor_model_parallel_size
=
1
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
with
set_current_vllm_config
(
vllm_config
=
vllm_config
):
with
set_default_torch_dtype
(
model_config
.
dtype
):
model
=
model_cls
(
vllm_config
=
vllm_config
)
yield
model
del
model
cleanup_dist_env_and_memory
()
def
get_model_id_to_test
(
def
get_model_id_to_test
(
model_arch_list
:
Iterable
[
str
])
->
list
[
tuple
[
str
,
str
]]:
model_arch_list
:
Iterable
[
str
])
->
list
[
tuple
[
str
,
str
]]:
filtered_results
=
[]
filtered_results
=
[]
...
@@ -148,12 +174,10 @@ def get_model_id_to_test(
...
@@ -148,12 +174,10 @@ def get_model_id_to_test(
return
filtered_results
return
filtered_results
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_arch, model_id"
,
"model_arch, model_id"
,
get_model_id_to_test
(
_MULTIMODAL_EXAMPLE_MODELS
.
keys
()))
get_model_id_to_test
(
_MULTIMODAL_EXAMPLE_MODELS
.
keys
()))
def
test_model_tensor_schema
(
model_arch
:
str
,
model_id
:
str
,
def
test_model_tensor_schema
(
model_arch
:
str
,
model_id
:
str
):
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
):
if
model_arch
in
ARCH_TO_SKIP
:
if
model_arch
in
ARCH_TO_SKIP
:
pytest
.
skip
(
f
"Skipping
{
model_arch
}
due to
{
ARCH_TO_SKIP
[
model_arch
]
}
"
)
pytest
.
skip
(
f
"Skipping
{
model_arch
}
due to
{
ARCH_TO_SKIP
[
model_arch
]
}
"
)
if
model_id
in
REPO_ID_TO_SKIP
:
if
model_id
in
REPO_ID_TO_SKIP
:
...
@@ -174,14 +198,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
...
@@ -174,14 +198,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
revision
=
model_info
.
revision
,
revision
=
model_info
.
revision
,
trust_remote_code
=
model_info
.
trust_remote_code
,
trust_remote_code
=
model_info
.
trust_remote_code
,
hf_overrides
=
model_info
.
hf_overrides
,
hf_overrides
=
hf_overrides
_fn
,
)
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
model_cls
=
MULTIMODAL_REGISTRY
.
_get_model_cls
(
model_config
)
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
if
not
any
(
inputs_parse_methods
=
[]
hasattr
(
model_cls
,
f
"_parse_and_validate_
{
m
}
_input"
)
for
attr_name
in
dir
(
model_cls
):
for
m
in
[
"image"
,
"video"
,
"audio"
]):
attr
=
getattr
(
model_cls
,
attr_name
)
if
hasattr
(
attr
,
"__annotations__"
):
return_type
=
attr
.
__annotations__
.
get
(
"return"
,
None
)
if
return_type
is
not
None
and
"Input"
in
str
(
return_type
):
inputs_parse_methods
.
append
(
attr_name
)
if
not
any
(
inputs_parse_methods
):
pytest
.
skip
(
f
"
{
model_arch
}
does not support tensor schema validation."
)
pytest
.
skip
(
f
"
{
model_arch
}
does not support tensor schema validation."
)
ctx
=
InputProcessingContext
(
ctx
=
InputProcessingContext
(
...
@@ -194,68 +224,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
...
@@ -194,68 +224,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
modality
:
3
if
limit
is
None
else
limit
modality
:
3
if
limit
is
None
else
limit
for
modality
,
limit
in
supported_mm_limits
.
items
()
for
modality
,
limit
in
supported_mm_limits
.
items
()
}
}
model_config
.
get_multimodal_config
().
limit_per_prompt
=
limit_mm_per_prompt
# Avoid calling model.forward()
processor
=
factories
.
build_processor
(
ctx
,
cache
=
None
)
def
_initialize_kv_caches_v0
(
self
)
->
None
:
self
.
cache_config
.
num_gpu_blocks
=
0
with
initialize_dummy_model
(
model_cls
,
model_config
)
as
model
:
self
.
cache_config
.
num_cpu_blocks
=
0
for
modality
,
_
,
mm_kwargs
in
create_batched_mm_kwargs
(
model_config
,
processor
):
def
_initialize_kv_caches_v1
(
self
,
vllm_config
):
for
method_name
in
inputs_parse_methods
:
kv_cache_specs
=
self
.
model_executor
.
get_kv_cache_specs
()
print
(
f
"Testing `
{
method_name
}
` with modality=
{
modality
}
"
scheduler_kv_cache_config
=
get_kv_cache_config
(
f
"and mm_kwargs
{
list
(
mm_kwargs
.
keys
())
}
"
)
vllm_config
,
getattr
(
model
,
method_name
)(
modality
=
modality
,
**
mm_kwargs
)
kv_cache_specs
[
0
],
10
*
GiB_bytes
,
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return
1
,
0
,
scheduler_kv_cache_config
with
(
patch
.
object
(
V0LLMEngine
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v0
),
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v1
),
monkeypatch
.
context
()
as
m
):
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
if
model_info
.
v0_only
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
# TODO(Isotr0py): Can we avoid initializing engine?
with
(
set_default_torch_num_threads
(
1
),
vllm_runner
(
model_id
,
tokenizer_name
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
revision
=
model_info
.
revision
,
trust_remote_code
=
model_info
.
trust_remote_code
,
max_model_len
=
model_info
.
max_model_len
,
load_format
=
"dummy"
,
hf_overrides
=
hf_overrides_fn
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
enforce_eager
=
True
,
)
as
vllm_model
,
):
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
llm_engine
=
vllm_model
.
llm
.
llm_engine
if
hasattr
(
llm_engine
,
"processor"
):
# v1 processor
mm_registry
=
llm_engine
.
processor
.
mm_registry
else
:
# v0 input_preprocessor
mm_registry
=
llm_engine
.
input_preprocessor
.
mm_registry
processor
=
mm_registry
.
create_processor
(
model_config
)
def
validate_model_input
(
model
,
modality
:
str
,
mm_kwargs
:
MultiModalKwargs
):
method_name
=
f
"_parse_and_validate_
{
modality
}
_input"
if
hasattr
(
model
,
method_name
):
getattr
(
model
,
method_name
)(
**
mm_kwargs
)
for
modality
,
_
,
mm_kwargs
in
create_batched_mm_kwargs
(
model_config
,
processor
):
valid_func
=
partial
(
validate_model_input
,
modality
=
modality
,
mm_kwargs
=
mm_kwargs
)
vllm_model
.
apply_model
(
valid_func
)
tests/models/quantization/test_fp8.py
View file @
d2b52805
...
@@ -32,7 +32,7 @@ from ..utils import check_logprobs_close
...
@@ -32,7 +32,7 @@ from ..utils import check_logprobs_close
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
...
@@ -57,9 +57,6 @@ def test_models(
...
@@ -57,9 +57,6 @@ def test_models(
numerical sensitive kernels.
numerical sensitive kernels.
"""
"""
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
kv_cache_dtype
==
"fp8_e5m2"
and
current_platform
.
is_rocm
():
if
kv_cache_dtype
==
"fp8_e5m2"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
pytest
.
skip
(
f
"
{
kv_cache_dtype
}
is currently not supported on ROCm/HIP."
)
f
"
{
kv_cache_dtype
}
is currently not supported on ROCm/HIP."
)
...
...
tests/models/registry.py
View file @
d2b52805
...
@@ -137,6 +137,9 @@ class _HfExamplesInfo:
...
@@ -137,6 +137,9 @@ class _HfExamplesInfo:
# yapf: disable
# yapf: disable
_TEXT_GENERATION_EXAMPLE_MODELS
=
{
_TEXT_GENERATION_EXAMPLE_MODELS
=
{
# [Decoder-only]
# [Decoder-only]
"ApertusForCausalLM"
:
_HfExamplesInfo
(
"swiss-ai/Apertus-8B"
,
min_transformers_version
=
"4.56.0"
,
trust_remote_code
=
True
),
"AquilaModel"
:
_HfExamplesInfo
(
"BAAI/AquilaChat-7B"
,
"AquilaModel"
:
_HfExamplesInfo
(
"BAAI/AquilaChat-7B"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"AquilaForCausalLM"
:
_HfExamplesInfo
(
"BAAI/AquilaChat2-7B"
,
"AquilaForCausalLM"
:
_HfExamplesInfo
(
"BAAI/AquilaChat2-7B"
,
...
@@ -215,9 +218,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -215,9 +218,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"HunYuanDenseV1ForCausalLM"
:
_HfExamplesInfo
(
"tencent/Hunyuan-7B-Instruct-0124"
,
"HunYuanDenseV1ForCausalLM"
:
_HfExamplesInfo
(
"tencent/Hunyuan-7B-Instruct-0124"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
is_available_online
=
False
),
is_available_online
=
False
),
"HCXVisionForCausalLM"
:
_HfExamplesInfo
(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
,
trust_remote_code
=
True
),
"InternLMForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm-chat-7b"
,
"InternLMForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm-chat-7b"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"InternLM2ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm2-chat-7b"
,
"InternLM2ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm2-chat-7b"
,
...
@@ -233,6 +233,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -233,6 +233,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"tiny"
:
"ai21labs/Jamba-tiny-dev"
,
"tiny"
:
"ai21labs/Jamba-tiny-dev"
,
"random"
:
"ai21labs/Jamba-tiny-random"
,
# noqa: E501
"random"
:
"ai21labs/Jamba-tiny-random"
,
# noqa: E501
}),
}),
"Lfm2ForCausalLM"
:
_HfExamplesInfo
(
"LiquidAI/LFM2-1.2B"
,
min_transformers_version
=
"4.54"
),
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
,
extras
=
{
"guard"
:
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
extras
=
{
"guard"
:
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
"hermes"
:
"NousResearch/Hermes-3-Llama-3.1-8B"
,
# noqa: E501
"hermes"
:
"NousResearch/Hermes-3-Llama-3.1-8B"
,
# noqa: E501
...
@@ -293,13 +295,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -293,13 +295,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-8B"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen3-30B-A3B"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
),
"SeedOssForCausalLM"
:
_HfExamplesInfo
(
"ByteDance-Seed/Seed-OSS-36B-Instruct"
,
# noqa: E501
trust_remote_code
=
True
,
is_available_online
=
False
),
"SmolLM3ForCausalLM"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolLM3-3B"
),
"SmolLM3ForCausalLM"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolLM3-3B"
),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
),
# noqa: E501
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
),
# noqa: E501
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
),
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder2-3b"
),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder2-3b"
),
"Step3TextForCausalLM"
:
_HfExamplesInfo
(
"stepfun-ai/step3"
,
"Step3TextForCausalLM"
:
_HfExamplesInfo
(
"stepfun-ai/step3"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
),
is_available_online
=
False
),
"SolarForCausalLM"
:
_HfExamplesInfo
(
"upstage/solar-pro-preview-instruct"
,
"SolarForCausalLM"
:
_HfExamplesInfo
(
"upstage/solar-pro-preview-instruct"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
"Tele-AI/TeleChat2-3B"
,
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
"Tele-AI/TeleChat2-3B"
,
...
@@ -322,8 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -322,8 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
_EMBEDDING_EXAMPLE_MODELS
=
{
_EMBEDDING_EXAMPLE_MODELS
=
{
# [Text-only]
# [Text-only]
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
,
v0_only
=
True
),
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
),
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
,
v0_only
=
True
),
# noqa: E501
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
),
# noqa: E501
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GteModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
"GteModel"
:
_HfExamplesInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
...
@@ -336,9 +340,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -336,9 +340,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"ModernBertModel"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
"ModernBertModel"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-modernbert-base"
,
trust_remote_code
=
True
,
v0_only
=
True
),
trust_remote_code
=
True
),
"NomicBertModel"
:
_HfExamplesInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
"NomicBertModel"
:
_HfExamplesInfo
(
"nomic-ai/nomic-embed-text-v2-moe"
,
trust_remote_code
=
True
,
v0_only
=
True
),
# noqa: E501
trust_remote_code
=
True
),
# noqa: E501
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-RM-72B"
,
max_transformers_version
=
"4.53"
,
max_transformers_version
=
"4.53"
,
...
@@ -346,9 +350,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -346,9 +350,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-PRM-7B"
,
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Math-PRM-7B"
,
max_transformers_version
=
"4.53"
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"HF model uses remote code that is not compatible with latest Transformers"
),
# noqa: E501
transformers_version_reason
=
"HF model uses remote code that is not compatible with latest Transformers"
),
# noqa: E501
"RobertaModel"
:
_HfExamplesInfo
(
"sentence-transformers/stsb-roberta-base-v2"
,
v0_only
=
True
),
# noqa: E501
"RobertaModel"
:
_HfExamplesInfo
(
"sentence-transformers/stsb-roberta-base-v2"
),
# noqa: E501
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
"sentence-transformers/all-roberta-large-v1"
,
v0_only
=
True
),
# noqa: E501
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
"sentence-transformers/all-roberta-large-v1"
),
# noqa: E501
"XLMRobertaModel"
:
_HfExamplesInfo
(
"intfloat/multilingual-e5-small"
,
v0_only
=
True
),
# noqa: E501
"XLMRobertaModel"
:
_HfExamplesInfo
(
"intfloat/multilingual-e5-small"
),
# noqa: E501
# [Multimodal]
# [Multimodal]
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"royokong/e5-v"
),
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"royokong/e5-v"
),
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
...
@@ -363,16 +367,19 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
...
@@ -363,16 +367,19 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
"GPT2ForSequenceClassification"
:
_HfExamplesInfo
(
"nie3e/sentiment-polish-gpt2-small"
),
# noqa: E501
"GPT2ForSequenceClassification"
:
_HfExamplesInfo
(
"nie3e/sentiment-polish-gpt2-small"
),
# noqa: E501
# [Cross-encoder]
# [Cross-encoder]
"BertForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
v0_only
=
True
),
# noqa: E501
"BertForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
),
# noqa: E501
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-reranker-modernbert-base"
,
v0_only
=
True
),
# noqa: E501
"GteNewForSequenceClassification"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-multilingual-reranker-base"
,
# noqa: E501
"RobertaForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/quora-roberta-base"
,
v0_only
=
True
),
# noqa: E501
trust_remote_code
=
True
,
"XLMRobertaForSequenceClassification"
:
_HfExamplesInfo
(
"BAAI/bge-reranker-v2-m3"
,
v0_only
=
True
),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]}),
# noqa: E501
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
"Alibaba-NLP/gte-reranker-modernbert-base"
),
# noqa: E501
"RobertaForSequenceClassification"
:
_HfExamplesInfo
(
"cross-encoder/quora-roberta-base"
),
# noqa: E501
"XLMRobertaForSequenceClassification"
:
_HfExamplesInfo
(
"BAAI/bge-reranker-v2-m3"
),
# noqa: E501
}
}
_AUTOMATIC_CONVERTED_MODELS
=
{
_AUTOMATIC_CONVERTED_MODELS
=
{
# Use as_seq_cls_model for automatic conversion
# Use as_seq_cls_model for automatic conversion
"GemmaForSequenceClassification"
:
_HfExamplesInfo
(
"BAAI/bge-reranker-v2-gemma"
,
# noqa: E501
"GemmaForSequenceClassification"
:
_HfExamplesInfo
(
"BAAI/bge-reranker-v2-gemma"
,
# noqa: E501
v0_only
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GemmaForSequenceClassification"
],
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"GemmaForSequenceClassification"
],
# noqa: E501
"classifier_from_token"
:
[
"Yes"
],
# noqa: E501
"classifier_from_token"
:
[
"Yes"
],
# noqa: E501
"method"
:
"no_post_processing"
}),
# noqa: E501
"method"
:
"no_post_processing"
}),
# noqa: E501
...
@@ -395,6 +402,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -395,6 +402,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
transformers_version_reason
=
"HF model is not compatible."
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
"Emu3ForConditionalGeneration"
:
_HfExamplesInfo
(
"BAAI/Emu3-Chat-hf"
),
"Emu3ForConditionalGeneration"
:
_HfExamplesInfo
(
"BAAI/Emu3-Chat-hf"
),
"Ernie4_5_VLMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"baidu/ERNIE-4.5-VL-28B-A3B-PT"
,
# noqa: E501
trust_remote_code
=
True
),
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"Gemma3ForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3-4b-it"
),
"Gemma3ForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3-4b-it"
),
"Gemma3nForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3n-E2B-it"
,
# noqa: E501
"Gemma3nForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3n-E2B-it"
,
# noqa: E501
...
@@ -405,22 +414,28 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -405,22 +414,28 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
"Glm4vForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-4.1V-9B-Thinking"
),
# noqa: E501
"Glm4vForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-4.1V-9B-Thinking"
),
# noqa: E501
"Glm4vMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5V"
,
"Glm4vMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5V"
,
is_available_online
=
False
),
# noqa: E501
min_transformers_version
=
"4.56"
),
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
},
# noqa: E501
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
},
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
),
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
),
# noqa: E501
"HCXVisionForCausalLM"
:
_HfExamplesInfo
(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
,
# noqa: E501
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
},
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
},
# noqa: E501
min_transformers_version
=
"4.55.1"
,
min_transformers_version
=
"4.56"
,
transformers_version_reason
=
"HF model broken in 4.55.0"
),
# noqa: E501
transformers_version_reason
=
"HF model broken in 4.55"
),
# noqa: E501
"InternS1ForConditionalGeneration"
:
_HfExamplesInfo
(
"internlm/Intern-S1"
,
trust_remote_code
=
True
),
# noqa: E501
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
,
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
,
"3.0"
:
"OpenGVLab/InternVL3-1B"
},
# noqa: E501
"3.0"
:
"OpenGVLab/InternVL3-1B"
,
# noqa: E501
trust_remote_code
=
True
),
"3.5-qwen3"
:
"OpenGVLab/InternVL3_5-1B"
,
# noqa: E501
"InternS1ForConditionalGeneration"
:
_HfExamplesInfo
(
"internlm/Intern-S1"
,
"3.5-qwen3moe"
:
"OpenGVLab/InternVL3_5-30B-A3B"
,
# noqa: E501
"3.5-gptoss"
:
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"InternVLForConditionalGeneration"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL3-1B-hf"
),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
...
@@ -443,7 +458,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -443,7 +458,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
,
"4.0"
:
"openbmb/MiniCPM-V-4"
},
# noqa: E501
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
,
"4.0"
:
"openbmb/MiniCPM-V-4"
,
"4.5"
:
"openbmb/MiniCPM-V-4_5"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"MiniMaxVL01ForConditionalGeneration"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-VL-01"
,
# noqa: E501
"MiniMaxVL01ForConditionalGeneration"
:
_HfExamplesInfo
(
"MiniMaxAI/MiniMax-VL-01"
,
# noqa: E501
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -464,6 +479,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -464,6 +479,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
transformers_version_reason
=
"HF model is not compatible"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible"
,
# noqa: E501
extras
=
{
"1.6-llama"
:
"AIDC-AI/Ovis1.6-Llama3.2-3B"
,
extras
=
{
"1.6-llama"
:
"AIDC-AI/Ovis1.6-Llama3.2-3B"
,
"1.6-gemma"
:
"AIDC-AI/Ovis1.6-Gemma2-9B"
}),
# noqa: E501
"1.6-gemma"
:
"AIDC-AI/Ovis1.6-Gemma2-9B"
}),
# noqa: E501
"Ovis2_5"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2.5-2B"
,
trust_remote_code
=
True
),
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-mix-224"
,
# noqa: E501
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-mix-224"
,
# noqa: E501
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
...
@@ -487,14 +504,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -487,14 +504,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_model_len
=
4096
),
max_model_len
=
4096
),
"Qwen2_5OmniModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-3B"
),
"Qwen2_5OmniModel"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-3B"
),
"Qwen2_5OmniForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-7B-AWQ"
),
# noqa: E501
"Qwen2_5OmniForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-Omni-7B-AWQ"
),
# noqa: E501
"RForConditionalGeneration"
:
_HfExamplesInfo
(
"YannQi/R-4B"
,
trust_remote_code
=
True
),
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
,
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
"Skywork/Skywork-R1V-38B"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"SmolVLMForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
# noqa: E501
"SmolVLMForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.5
5.1
"
,
min_transformers_version
=
"4.5
6
"
,
transformers_version_reason
=
"HF model broken in 4.55
.0
"
),
# noqa: E501
transformers_version_reason
=
"HF model broken in 4.55"
),
# noqa: E501
"Step3VLForConditionalGeneration"
:
_HfExamplesInfo
(
"stepfun-ai/step3"
,
"Step3VLForConditionalGeneration"
:
_HfExamplesInfo
(
"stepfun-ai/step3"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
),
is_available_online
=
False
),
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
),
# noqa: E501
"TarsierForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier-7b"
),
# noqa: E501
...
@@ -507,6 +525,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -507,6 +525,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online
=
False
,
is_available_online
=
False
,
),
),
# [Encoder-decoder]
# [Encoder-decoder]
"DonutForConditionalGeneration"
:
_HfExamplesInfo
(
"naver-clova-ix/donut-base-finetuned-docvqa"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DonutForConditionalGeneration"
],
"model_type"
:
"donut"
},
# noqa: E501
extras
=
{
"dolphin"
:
"ByteDance/Dolphin"
}),
# noqa: E501
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration"
:
_HfExamplesInfo
(
"microsoft/Florence-2-base"
,
# noqa: E501
"Florence2ForConditionalGeneration"
:
_HfExamplesInfo
(
"microsoft/Florence-2-base"
,
# noqa: E501
...
@@ -530,6 +551,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -530,6 +551,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"EagleDeepSeekMTPModel"
:
_HfExamplesInfo
(
"eagle618/deepseek-v3-random"
,
speculative_model
=
"eagle618/eagle-deepseek-v3-random"
,
# noqa: E501
trust_remote_code
=
True
),
"EagleLlamaForCausalLM"
:
_HfExamplesInfo
(
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"EagleLlamaForCausalLM"
:
_HfExamplesInfo
(
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
speculative_model
=
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
speculative_model
=
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
...
@@ -553,6 +577,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -553,6 +577,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
is_available_online
=
False
,
is_available_online
=
False
,
speculative_model
=
"openbmb/MiniCPM-2B-sft-bf16"
,
speculative_model
=
"openbmb/MiniCPM-2B-sft-bf16"
,
tokenizer
=
"openbmb/MiniCPM-2B-sft-bf16"
),
tokenizer
=
"openbmb/MiniCPM-2B-sft-bf16"
),
"ErnieMTPModel"
:
_HfExamplesInfo
(
"baidu/ERNIE-4.5-21B-A3B-PT"
,
trust_remote_code
=
True
,
speculative_model
=
"baidu/ERNIE-4.5-21B-A3B-PT"
),
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5"
,
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
"zai-org/GLM-4.5"
,
speculative_model
=
"zai-org/GLM-4.5"
,
speculative_model
=
"zai-org/GLM-4.5"
,
min_transformers_version
=
"4.54"
,
min_transformers_version
=
"4.54"
,
...
@@ -565,7 +592,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -565,7 +592,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
_TRANSFORMERS_BACKEND_MODELS
=
{
_TRANSFORMERS_BACKEND_MODELS
=
{
"TransformersModel"
:
_HfExamplesInfo
(
"Qwen/Qwen3-Embedding-0.6B"
),
"TransformersModel"
:
_HfExamplesInfo
(
"Qwen/Qwen3-Embedding-0.6B"
),
"TransformersForCausalLM"
:
_HfExamplesInfo
(
"hmellor/Ilama-3.2-1B"
,
trust_remote_code
=
True
),
# noqa: E501
"TransformersForCausalLM"
:
_HfExamplesInfo
(
"hmellor/Ilama-3.2-1B"
,
trust_remote_code
=
True
),
# noqa: E501
"TransformersForMultimodalLM"
:
_HfExamplesInfo
(
"
OpenGVLab/InternVL3-1B
-hf"
),
"TransformersForMultimodalLM"
:
_HfExamplesInfo
(
"
BAAI/Emu3-Chat
-hf"
),
}
}
_EXAMPLE_MODELS
=
{
_EXAMPLE_MODELS
=
{
...
...
tests/models/test_initialization.py
View file @
d2b52805
...
@@ -38,11 +38,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
...
@@ -38,11 +38,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
model_arch
=
model_arch
,
model_arch
=
model_arch
,
exist_overrides
=
model_info
.
hf_overrides
)
exist_overrides
=
model_info
.
hf_overrides
)
if
model_arch
in
(
"Llama4ForCausalLM"
,
"EagleLlama4ForCausalLM"
):
from
vllm.model_executor.models.llama4
import
Llama4ForCausalLM
from
vllm.model_executor.models.registry
import
ModelRegistry
ModelRegistry
.
register_model
(
"Llama4ForCausalLM"
,
Llama4ForCausalLM
)
# Avoid calling model.forward()
# Avoid calling model.forward()
def
_initialize_kv_caches_v0
(
self
)
->
None
:
def
_initialize_kv_caches_v0
(
self
)
->
None
:
self
.
cache_config
.
num_gpu_blocks
=
0
self
.
cache_config
.
num_gpu_blocks
=
0
...
@@ -95,6 +90,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
...
@@ -95,6 +90,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
HF_EXAMPLE_MODELS
.
get_supported_archs
())
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
HF_EXAMPLE_MODELS
.
get_supported_archs
())
def
test_can_initialize
(
model_arch
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_can_initialize
(
model_arch
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
):
if
model_arch
==
"Lfm2ForCausalLM"
:
pytest
.
skip
(
"Skipping until test supports V1-only models"
)
can_initialize
(
model_arch
,
monkeypatch
,
HF_EXAMPLE_MODELS
)
can_initialize
(
model_arch
,
monkeypatch
,
HF_EXAMPLE_MODELS
)
...
...
tests/models/test_registry.py
View file @
d2b52805
...
@@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
...
@@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
ModelRegistry
.
get_supported_archs
())
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
ModelRegistry
.
get_supported_archs
())
def
test_registry_imports
(
model_arch
):
def
test_registry_imports
(
model_arch
):
# Skip if transformers version is incompatible
model_info
=
HF_EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# Ensure all model classes can be imported successfully
# Ensure all model classes can be imported successfully
model_cls
=
ModelRegistry
.
_try_load_model_cls
(
model_arch
)
model_cls
=
ModelRegistry
.
_try_load_model_cls
(
model_arch
)
assert
model_cls
is
not
None
assert
model_cls
is
not
None
...
...
tests/models/utils.py
View file @
d2b52805
...
@@ -3,7 +3,8 @@
...
@@ -3,7 +3,8 @@
import
warnings
import
warnings
from
collections.abc
import
Sequence
from
collections.abc
import
Sequence
from
typing
import
Any
,
NamedTuple
,
Optional
,
Union
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
,
Union
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
...
@@ -339,36 +340,43 @@ def softmax(data):
...
@@ -339,36 +340,43 @@ def softmax(data):
return
F
.
softmax
(
data
,
dim
=-
1
)
return
F
.
softmax
(
data
,
dim
=-
1
)
class
EmbedModelInfo
(
NamedTuple
):
@
dataclass
class
ModelInfo
:
name
:
str
name
:
str
is_matryoshka
:
bool
=
False
matryoshka_dimensions
:
Optional
[
list
[
int
]]
=
None
architecture
:
str
=
""
architecture
:
str
=
""
dtype
:
str
=
"auto"
dtype
:
str
=
"auto"
hf_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
default_pooling_type
:
str
=
""
default_pooling_type
:
str
=
""
enable_test
:
bool
=
True
enable_test
:
bool
=
True
@
dataclass
class
EmbedModelInfo
(
ModelInfo
):
is_matryoshka
:
bool
=
False
matryoshka_dimensions
:
Optional
[
list
[
int
]]
=
None
@
dataclass
class
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
):
class
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
):
default_pooling_type
:
str
=
"CLS"
default_pooling_type
:
str
=
"CLS"
@
dataclass
class
LASTPoolingEmbedModelInfo
(
EmbedModelInfo
):
class
LASTPoolingEmbedModelInfo
(
EmbedModelInfo
):
default_pooling_type
:
str
=
"LAST"
default_pooling_type
:
str
=
"LAST"
class
RerankModelInfo
(
NamedTuple
):
@
dataclass
name
:
str
class
RerankModelInfo
(
ModelInfo
):
architecture
:
str
=
""
pass
dtype
:
str
=
"auto"
default_pooling_type
:
str
=
""
enable_test
:
bool
=
True
@
dataclass
class
CLSPoolingRerankModelInfo
(
RerankModelInfo
):
class
CLSPoolingRerankModelInfo
(
RerankModelInfo
):
default_pooling_type
:
str
=
"CLS"
default_pooling_type
:
str
=
"CLS"
@
dataclass
class
LASTPoolingRerankModelInfo
(
RerankModelInfo
):
class
LASTPoolingRerankModelInfo
(
RerankModelInfo
):
default_pooling_type
:
str
=
"LAST"
default_pooling_type
:
str
=
"LAST"
...
...
tests/multimodal/test_cache.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
from
vllm.multimodal.cache
import
MultiModalCache
,
MultiModalCacheItemMetadata
from
vllm.config
import
ModelConfig
,
ParallelConfig
,
VllmConfig
from
vllm.multimodal.inputs
import
(
MultiModalFieldElem
,
MultiModalKwargs
,
from
vllm.multimodal.cache
import
(
MultiModalCache
,
MultiModalKwargsItem
,
MultiModalProcessorCacheItem
,
MultiModalProcessorCacheItemMetadata
,
processor_cache_from_config
,
receiver_cache_from_config
)
from
vllm.multimodal.hasher
import
MultiModalHasher
from
vllm.multimodal.inputs
import
(
MultiModalFieldElem
,
MultiModalKwargsItem
,
MultiModalKwargsItems
,
MultiModalSharedField
)
MultiModalSharedField
)
from
vllm.multimodal.processing
import
PromptInsertion
from
vllm.multimodal.registry
import
MultiModalRegistry
def
_dummy_elem
(
modality
:
str
,
key
:
str
,
size
:
int
,
*
,
rng
:
Optional
[
np
.
random
.
RandomState
]
=
None
,
):
if
rng
is
None
:
data
=
torch
.
empty
((
size
,
),
dtype
=
torch
.
int8
)
else
:
data
=
torch
.
from_numpy
(
rng
.
randint
(
4
,
size
=
(
size
,
),
dtype
=
np
.
int8
))
def
_dummy_elem
(
modality
:
str
,
key
:
str
,
size
:
int
):
return
MultiModalFieldElem
(
return
MultiModalFieldElem
(
modality
=
modality
,
modality
=
modality
,
key
=
key
,
key
=
key
,
data
=
torch
.
empty
((
size
,
),
dtype
=
torch
.
int8
)
,
data
=
data
,
field
=
MultiModalSharedField
(
1
),
field
=
MultiModalSharedField
(
1
),
)
)
def
_dummy_item
(
modality
:
str
,
size_by_key
:
dict
[
str
,
int
]):
def
_dummy_item
(
modality
:
str
,
size_by_key
:
dict
[
str
,
int
],
*
,
rng
:
Optional
[
np
.
random
.
RandomState
]
=
None
,
):
return
MultiModalKwargsItem
.
from_elems
([
return
MultiModalKwargsItem
.
from_elems
([
_dummy_elem
(
modality
,
key
,
size
)
for
key
,
size
in
size_by_key
.
items
()
_dummy_elem
(
modality
,
key
,
size
,
rng
=
rng
)
for
key
,
size
in
size_by_key
.
items
()
])
])
def
_dummy_kw
(
size_by_key_modality
:
dict
[
str
,
dict
[
str
,
int
]]):
def
_dummy_items
(
return
MultiModalKwargs
([
size_by_key_modality
:
dict
[
str
,
dict
[
str
,
int
]],
_dummy_item
(
modality
,
size_by_key
)
*
,
rng
:
Optional
[
np
.
random
.
RandomState
]
=
None
,
):
return
MultiModalKwargsItems
.
from_seq
([
_dummy_item
(
modality
,
size_by_key
,
rng
=
rng
)
for
modality
,
size_by_key
in
size_by_key_modality
.
items
()
for
modality
,
size_by_key
in
size_by_key_modality
.
items
()
])
])
...
@@ -37,7 +69,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
...
@@ -37,7 +69,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
[
[
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_kw
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
(
_dummy_items
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
(
_dummy_items
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}).
get_data
(),
460
),
# noqa: E501
],
],
)
)
# yapf: enable
# yapf: enable
...
@@ -47,5 +80,139 @@ def test_cache_item_size(item, expected_size):
...
@@ -47,5 +80,139 @@ def test_cache_item_size(item, expected_size):
cache
[
""
]
=
item
cache
[
""
]
=
item
assert
cache
.
currsize
==
expected_size
assert
cache
.
currsize
==
expected_size
cache
[
""
]
=
MultiModalCacheItemMetadata
.
wraps
(
item
)
prompt_update
=
PromptInsertion
(
"dummy"
,
"target"
,
"insertion"
)
\
.
resolve
(
0
)
cache
[
""
]
=
MultiModalProcessorCacheItem
(
item
,
[
prompt_update
])
assert
cache
.
currsize
==
expected_size
cache
[
""
]
=
MultiModalProcessorCacheItemMetadata
(
item
,
[
prompt_update
])
assert
cache
.
currsize
==
expected_size
assert
cache
.
currsize
==
expected_size
def
_create_vllm_config
(
*
,
mm_processor_cache_gb
:
float
,
enable_ipc
:
bool
,
):
return
VllmConfig
(
model_config
=
ModelConfig
(
mm_processor_cache_gb
=
mm_processor_cache_gb
),
parallel_config
=
ParallelConfig
(
data_parallel_size
=
1
if
enable_ipc
else
2
),
)
def
_compare_caches
(
config_0
:
VllmConfig
,
config_1
:
VllmConfig
,
*
,
item_capacity
:
int
=
8
,
hit_rate
:
float
=
0.5
,
max_items_per_iter
:
int
=
3
,
is_cached_calls_per_iter
:
int
,
n_iter
:
int
=
100
,
seed
:
int
=
0
,
):
mm_registry
=
MultiModalRegistry
()
cache_0_p0
=
processor_cache_from_config
(
config_0
,
mm_registry
)
cache_0_p1
=
receiver_cache_from_config
(
config_0
,
mm_registry
)
cache_1_p0
=
processor_cache_from_config
(
config_1
,
mm_registry
)
cache_1_p1
=
receiver_cache_from_config
(
config_1
,
mm_registry
)
cache_size_gb
=
max
(
config_0
.
model_config
.
mm_processor_cache_gb
,
config_1
.
model_config
.
mm_processor_cache_gb
,
)
item_size_gb
=
int
(
cache_size_gb
/
item_capacity
)
rng
=
np
.
random
.
RandomState
(
seed
)
all_items
=
[
_dummy_item
(
"item"
,
{
"key"
:
item_size_gb
},
rng
=
rng
)
for
_
in
range
(
int
(
item_capacity
/
hit_rate
))
]
all_hashes
=
[
MultiModalHasher
.
hash_kwargs
(
item
=
item
.
get_data
())
for
item
in
all_items
]
# Should not be used since there is nothing to convert to text
prompt_update
=
PromptInsertion
(
"dummy"
,
"target"
,
"insertion"
)
for
it
in
range
(
n_iter
):
num_items_to_select
=
rng
.
randint
(
0
,
max_items_per_iter
)
item_idxs_to_select
=
rng
.
choice
(
len
(
all_items
),
num_items_to_select
)
selected_items
=
[
all_items
[
idx
]
for
idx
in
item_idxs_to_select
]
selected_hashes
=
[
all_hashes
[
idx
]
for
idx
in
item_idxs_to_select
]
if
cache_0_p0
is
None
:
cache_0_p0_out
=
selected_items
else
:
for
_
in
range
(
is_cached_calls_per_iter
):
cache_0_p0
.
is_cached
(
selected_hashes
)
cache_0_p0_out
=
[
item
for
item
,
_
in
cache_0_p0
.
get_and_update
(
[(
item
,
prompt_update
.
content
)
for
item
in
selected_items
],
selected_hashes
,
)
]
if
cache_1_p0
is
None
:
cache_1_p0_out
=
selected_items
else
:
for
_
in
range
(
is_cached_calls_per_iter
):
cache_1_p0
.
is_cached
(
selected_hashes
)
cache_1_p0_out
=
[
item
for
item
,
_
in
cache_1_p0
.
get_and_update
(
[(
item
,
prompt_update
.
content
)
for
item
in
selected_items
],
selected_hashes
,
)
]
if
cache_0_p1
is
None
:
cache_0_p1_out
=
cache_0_p0_out
else
:
cache_0_p1_out
=
cache_0_p1
.
get_and_update
(
cache_0_p0_out
,
selected_hashes
)
if
cache_1_p1
is
None
:
cache_1_p1_out
=
cache_1_p0_out
else
:
cache_1_p1_out
=
cache_1_p1
.
get_and_update
(
cache_1_p0_out
,
selected_hashes
)
assert
cache_0_p1_out
==
cache_1_p1_out
,
f
"Failed at
{
it
=
}
"
@
pytest
.
mark
.
parametrize
(
"is_cached_calls_per_iter"
,
[
1
,
2
,
3
])
def
test_ipc_enable_disable_consistency
(
is_cached_calls_per_iter
):
cache_size_gb
=
1
/
(
1
<<
20
)
vllm_config_ipc_enabled
=
_create_vllm_config
(
mm_processor_cache_gb
=
cache_size_gb
,
enable_ipc
=
True
,
)
vllm_config_ipc_disabled
=
_create_vllm_config
(
mm_processor_cache_gb
=
0
,
enable_ipc
=
False
,
)
vllm_config_cache_disabled
=
_create_vllm_config
(
mm_processor_cache_gb
=
cache_size_gb
,
enable_ipc
=
True
,
)
_compare_caches
(
vllm_config_ipc_enabled
,
vllm_config_ipc_disabled
,
is_cached_calls_per_iter
=
is_cached_calls_per_iter
,
)
_compare_caches
(
vllm_config_ipc_disabled
,
vllm_config_cache_disabled
,
is_cached_calls_per_iter
=
is_cached_calls_per_iter
,
)
_compare_caches
(
vllm_config_cache_disabled
,
vllm_config_ipc_enabled
,
is_cached_calls_per_iter
=
is_cached_calls_per_iter
,
)
tests/multimodal/test_hasher.py
View file @
d2b52805
...
@@ -45,10 +45,11 @@ def test_hash_collision_image_transpose():
...
@@ -45,10 +45,11 @@ def test_hash_collision_image_transpose():
assert
hasher
.
hash_kwargs
(
image
=
image1
)
!=
hasher
.
hash_kwargs
(
image
=
image2
)
assert
hasher
.
hash_kwargs
(
image
=
image1
)
!=
hasher
.
hash_kwargs
(
image
=
image2
)
def
test_hash_collision_tensor_shape
():
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float32
,
torch
.
bfloat16
])
def
test_hash_collision_tensor_shape
(
dtype
):
# The hash should be different though the data is the same when flattened
# The hash should be different though the data is the same when flattened
arr1
=
torch
.
zeros
((
5
,
10
,
20
,
3
))
arr1
=
torch
.
zeros
((
5
,
10
,
20
,
3
)
,
dtype
=
dtype
)
arr2
=
torch
.
zeros
((
10
,
20
,
5
,
3
))
arr2
=
torch
.
zeros
((
10
,
20
,
5
,
3
)
,
dtype
=
dtype
)
hasher
=
MultiModalHasher
hasher
=
MultiModalHasher
assert
hasher
.
hash_kwargs
(
data
=
arr1
)
!=
hasher
.
hash_kwargs
(
data
=
arr2
)
assert
hasher
.
hash_kwargs
(
data
=
arr1
)
!=
hasher
.
hash_kwargs
(
data
=
arr2
)
...
...
tests/multimodal/test_processing.py
View file @
d2b52805
...
@@ -17,13 +17,11 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
...
@@ -17,13 +17,11 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
PromptReplacement
,
apply_text_matches
,
PromptReplacement
,
apply_text_matches
,
apply_token_matches
,
apply_token_matches
,
find_mm_placeholders
,
find_mm_placeholders
,
find_text_matches
,
find_token_matches
,
iter_token_matches
,
iter_token_matches
,
replace_token_matches
)
replace_token_matches
)
# yapf: enable
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
full_groupby
from
.utils
import
random_image
from
.utils
import
random_image
...
@@ -75,12 +73,15 @@ from .utils import random_image
...
@@ -75,12 +73,15 @@ from .utils import random_image
),
),
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"start_idx"
,
[
0
,
4
,
8
])
# yapf: enable
# yapf: enable
def
test_iter_token_matches
(
token_ids
,
match_ids
,
expected
):
def
test_iter_token_matches
(
token_ids
,
match_ids
,
expected
,
start_idx
):
result
=
list
(
iter_token_matches
(
token_ids
,
match_ids
))
result
=
list
(
iter_token_matches
(
token_ids
,
match_ids
,
start_idx
=
start_idx
))
# Manually constructed results
# Manually constructed results
assert
[
item
.
_asdict
()
for
item
in
result
]
==
expected
assert
[
item
.
_asdict
()
for
item
in
result
]
==
[
item
for
item
in
expected
if
item
[
"start_idx"
]
>=
start_idx
]
# Invariants
# Invariants
match_lens
=
[
end
-
start
for
start
,
end
in
result
]
match_lens
=
[
end
-
start
for
start
,
end
in
result
]
...
@@ -241,21 +242,23 @@ def test_find_token_matches(
...
@@ -241,21 +242,23 @@ def test_find_token_matches(
# Should not be used since there is nothing to convert to token IDs
# Should not be used since there is nothing to convert to token IDs
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
prompt_updates
=
[
prompt_updates
=
{
update_type
(
key
,
target
,
[]).
bind
(
mock_tokenizer
)
key
:
update_type
(
key
,
target
,
[]).
resolve
(
0
)
for
key
,
target
in
target_by_key
.
items
()
for
key
,
target
in
target_by_key
.
items
()
]
}
result
=
find_token_matches
(
prompt
,
prompt_updates
)
result
=
{
key
:
list
(
update
.
iter_token_matches
(
prompt
,
mock_tokenizer
))
for
key
,
update
in
prompt_updates
.
items
()
}
# Only displayed on error
# Only displayed on error
print
(
"result:"
,
result
)
print
(
"result:"
,
result
)
# Manually constructed results
# Manually constructed results
result_groups
=
dict
(
full_groupby
(
result
,
key
=
lambda
x
:
x
.
modality
))
assert
{
assert
{
key
:
[
key
:
[
dict
(
start_idx
=
item
.
start_idx
,
end_idx
=
item
.
end_idx
)
dict
(
start_idx
=
item
.
start_idx
,
end_idx
=
item
.
end_idx
)
for
item
in
result
_groups
.
get
(
key
,
[])
for
item
in
result
.
get
(
key
,
[])
]
]
for
key
in
expected_by_key
for
key
in
expected_by_key
}
==
expected_by_key
}
==
expected_by_key
...
@@ -388,21 +391,23 @@ def test_find_text_matches(
...
@@ -388,21 +391,23 @@ def test_find_text_matches(
# Should not be used since there is nothing to convert to text
# Should not be used since there is nothing to convert to text
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
prompt_updates
=
[
prompt_updates
=
{
update_type
(
key
,
target
,
[]).
bind
(
mock_tokenizer
)
key
:
update_type
(
key
,
target
,
[]).
resolve
(
0
)
for
key
,
target
in
target_by_key
.
items
()
for
key
,
target
in
target_by_key
.
items
()
]
}
result
=
find_text_matches
(
prompt
,
prompt_updates
)
result
=
{
key
:
list
(
update
.
iter_text_matches
(
prompt
,
mock_tokenizer
))
for
key
,
update
in
prompt_updates
.
items
()
}
# Only displayed on error
# Only displayed on error
print
(
"result:"
,
result
)
print
(
"result:"
,
result
)
# Manually constructed results
# Manually constructed results
result_groups
=
dict
(
full_groupby
(
result
,
key
=
lambda
x
:
x
.
modality
))
assert
{
assert
{
key
:
[
key
:
[
dict
(
start_idx
=
item
.
start_idx
,
end_idx
=
item
.
end_idx
)
dict
(
start_idx
=
item
.
start_idx
,
end_idx
=
item
.
end_idx
)
for
item
in
result
_groups
.
get
(
key
,
[])
for
item
in
result
.
get
(
key
,
[])
]
]
for
key
in
expected_by_key
for
key
in
expected_by_key
}
==
expected_by_key
}
==
expected_by_key
...
@@ -552,39 +557,35 @@ def test_find_update_text(
...
@@ -552,39 +557,35 @@ def test_find_update_text(
update_type
,
update_type
,
expected_by_mm_count
,
expected_by_mm_count
,
)
in
expected_by_update_type_mm_count
.
items
():
)
in
expected_by_update_type_mm_count
.
items
():
mm_prompt_updates
=
{
key
:
[
update_type
(
key
,
target
,
repl_by_key
[
key
]).
bind
(
mock_tokenizer
)]
for
key
,
target
in
target_by_key
.
items
()
}
mm_matches
=
{
key
:
find_text_matches
(
prompt
,
updates
)
for
key
,
updates
in
mm_prompt_updates
.
items
()
}
for
mm_count
,
expected
in
expected_by_mm_count
.
items
():
for
mm_count
,
expected
in
expected_by_mm_count
.
items
():
result
=
apply_text_matches
(
mm_prompt_updates
=
{
key
:
[[
update_type
(
key
,
target
,
repl_by_key
[
key
]).
resolve
(
i
)]
for
i
in
range
(
mm_count
)]
for
key
,
target
in
target_by_key
.
items
()
}
new_prompt
,
result
=
apply_text_matches
(
prompt
,
prompt
,
mm_matches
,
mm_prompt_updates
,
{
key
:
mm_count
mock_tokenizer
,
for
key
in
repl_by_key
},
)
)
# Only displayed on error
# Only displayed on error
print
(
"update_type:"
,
update_type
)
print
(
"update_type:"
,
update_type
)
print
(
"mm_count:"
,
mm_count
)
print
(
"mm_count:"
,
mm_count
)
print
(
"mm_matches:"
,
mm_matches
)
print
(
"mm_prompt_updates:"
,
mm_prompt_updates
)
print
(
"new_prompt:"
,
new_prompt
)
print
(
"result:"
,
result
)
print
(
"result:"
,
result
)
# Manually constructed results
# Manually constructed results
assert
resul
t
==
expected
assert
new_promp
t
==
expected
# yapf: disable
# yapf: disable
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"prompt"
,
"target_by_key"
,
"repl_by_key"
,
"expected_by_update_type_mm_count"
),
# noqa: E501
(
"prompt"
,
"target_by_key"
,
"repl_by_key"
,
"expected_by_update_type_mm_count"
),
# noqa: E501
[
[
# Tokenized test cases of `test_find_
replac
e_text`
# Tokenized test cases of `test_find_
updat
e_text`
# using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
# using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
(
(
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
...
@@ -726,32 +727,28 @@ def test_find_update_tokens(
...
@@ -726,32 +727,28 @@ def test_find_update_tokens(
update_type
,
update_type
,
expected_by_mm_count
,
expected_by_mm_count
,
)
in
expected_by_update_type_mm_count
.
items
():
)
in
expected_by_update_type_mm_count
.
items
():
mm_prompt_updates
=
{
key
:
[
update_type
(
key
,
target
,
repl_by_key
[
key
]).
bind
(
mock_tokenizer
)]
for
key
,
target
in
target_by_key
.
items
()
}
mm_matches
=
{
key
:
find_token_matches
(
prompt
,
updates
)
for
key
,
updates
in
mm_prompt_updates
.
items
()
}
for
mm_count
,
expected
in
expected_by_mm_count
.
items
():
for
mm_count
,
expected
in
expected_by_mm_count
.
items
():
result
=
apply_token_matches
(
mm_prompt_updates
=
{
key
:
[[
update_type
(
key
,
target
,
repl_by_key
[
key
]).
resolve
(
i
)]
for
i
in
range
(
mm_count
)]
for
key
,
target
in
target_by_key
.
items
()
}
new_prompt
,
result
=
apply_token_matches
(
prompt
,
prompt
,
mm_matches
,
mm_prompt_updates
,
{
key
:
mm_count
mock_tokenizer
,
for
key
in
repl_by_key
},
)
)
# Only displayed on error
# Only displayed on error
print
(
"update_type:"
,
update_type
)
print
(
"update_type:"
,
update_type
)
print
(
"mm_count:"
,
mm_count
)
print
(
"mm_count:"
,
mm_count
)
print
(
"mm_matches:"
,
mm_matches
)
print
(
"mm_prompt_updates:"
,
mm_prompt_updates
)
print
(
"new_prompt:"
,
new_prompt
)
print
(
"result:"
,
result
)
print
(
"result:"
,
result
)
# Manually constructed results
# Manually constructed results
assert
resul
t
==
expected
assert
new_promp
t
==
expected
# yapf: disable
# yapf: disable
...
@@ -878,17 +875,11 @@ def test_find_mm_placeholders(
...
@@ -878,17 +875,11 @@ def test_find_mm_placeholders(
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mm_prompt_updates
=
{
mm_prompt_updates
=
{
key
:
[
update_type
(
key
,
[],
repl
).
bind
(
mock_tokenizer
)]
key
:
[
[
update_type
(
key
,
[],
repl
).
resolve
(
i
)]
for
i
in
range
(
3
)]
for
key
,
repl
in
repl_by_key
.
items
()
for
key
,
repl
in
repl_by_key
.
items
()
}
}
result
=
find_mm_placeholders
(
result
=
find_mm_placeholders
(
prompt
,
mm_prompt_updates
,
mock_tokenizer
)
mm_prompt_updates
,
prompt
,
# Effectively match all occurrences in the prompt
{
key
:
3
for
key
in
repl_by_key
},
)
# Only displayed on error
# Only displayed on error
print
(
"result:"
,
result
)
print
(
"result:"
,
result
)
...
...
Prev
1
…
11
12
13
14
15
16
17
18
19
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment