Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ca796e19
Commit
ca796e19
authored
Mar 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.1' into v0.8.1-ori
parents
e983c804
61c7a1b8
Changes
130
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
300 additions
and
260 deletions
+300
-260
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+3
-3
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+2
-13
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+17
-50
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+27
-53
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+0
-3
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+49
-42
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+2
-9
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+28
-24
tests/models/embedding/vision_language/test_llava_next.py
tests/models/embedding/vision_language/test_llava_next.py
+3
-4
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+1
-2
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+2
-7
tests/models/fixtures/mistral_small_3_chat.json
tests/models/fixtures/mistral_small_3_chat.json
+1
-0
tests/models/fixtures/pixtral_chat_engine.json
tests/models/fixtures/pixtral_chat_engine.json
+0
-1
tests/models/utils.py
tests/models/utils.py
+4
-7
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+103
-6
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+1
-1
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+2
-0
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+2
-2
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+39
-24
tests/tpu/test_custom_dispatcher.py
tests/tpu/test_custom_dispatcher.py
+14
-9
No files found.
tests/entrypoints/test_chat_utils.py
View file @
ca796e19
...
@@ -34,7 +34,7 @@ def phi3v_model_config():
...
@@ -34,7 +34,7 @@ def phi3v_model_config():
tokenizer
=
PHI3V_MODEL_ID
,
tokenizer
=
PHI3V_MODEL_ID
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
dtype
=
"
bfloat16
"
,
dtype
=
"
auto
"
,
seed
=
0
,
seed
=
0
,
limit_mm_per_prompt
=
{
limit_mm_per_prompt
=
{
"image"
:
2
,
"image"
:
2
,
...
@@ -58,7 +58,7 @@ def mllama_model_config():
...
@@ -58,7 +58,7 @@ def mllama_model_config():
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
dtype
=
"
bfloat16
"
,
dtype
=
"
auto
"
,
seed
=
0
,
seed
=
0
,
limit_mm_per_prompt
=
{
limit_mm_per_prompt
=
{
"image"
:
2
,
"image"
:
2
,
...
@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
...
@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
dtype
=
"
bfloat16
"
,
dtype
=
"
auto
"
,
seed
=
0
,
seed
=
0
,
limit_mm_per_prompt
=
{
limit_mm_per_prompt
=
{
"image"
:
2
,
"image"
:
2
,
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
ca796e19
...
@@ -5,11 +5,10 @@ from typing import Optional
...
@@ -5,11 +5,10 @@ from typing import Optional
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
from
transformers
import
AutoModel
,
AutoTokenizer
,
BatchEncoding
from
transformers
import
AutoModel
,
AutoTokenizer
from
vllm.multimodal.audio
import
resample_audio
from
vllm.multimodal.audio
import
resample_audio
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
HfRunner
,
VllmRunner
from
....conftest
import
HfRunner
,
VllmRunner
from
....utils
import
RemoteOpenAIServer
from
....utils
import
RemoteOpenAIServer
...
@@ -107,8 +106,6 @@ def run_test(
...
@@ -107,8 +106,6 @@ def run_test(
**
kwargs
,
**
kwargs
,
):
):
"""Inference result should be the same between hf and vllm."""
"""Inference result should be the same between hf and vllm."""
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
# NOTE: take care of the order. run vLLM first, and then run HF.
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# if we run HF first, the cuda initialization will be done and it
...
@@ -124,15 +121,7 @@ def run_test(
...
@@ -124,15 +121,7 @@ def run_test(
for
vllm_prompt
,
_
,
audio
in
prompts_and_audios
for
vllm_prompt
,
_
,
audio
in
prompts_and_audios
]
]
def
process
(
hf_inputs
:
BatchEncoding
,
**
kwargs
):
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_inputs
[
"audio_values"
]
=
hf_inputs
[
"audio_values"
]
\
.
to
(
torch_dtype
)
# type: ignore
return
hf_inputs
with
hf_runner
(
model
,
dtype
=
dtype
,
postprocess_inputs
=
process
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_outputs_per_audio
=
[
hf_outputs_per_audio
=
[
hf_model
.
generate_greedy_logprobs_limit
(
hf_model
.
generate_greedy_logprobs_limit
(
[
hf_prompt
],
[
hf_prompt
],
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
ca796e19
...
@@ -9,7 +9,7 @@ from pathlib import PosixPath
...
@@ -9,7 +9,7 @@ from pathlib import PosixPath
import
pytest
import
pytest
from
packaging.version
import
Version
from
packaging.version
import
Version
from
transformers
import
AutoModelFor
PreTraining
,
AutoModelForVision2Seq
from
transformers
import
AutoModelFor
ImageTextToText
,
AutoModelForVision2Seq
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
...
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
convert_assets_to_embeddings
=
model_utils
.
get_llava_embeddings
,
convert_assets_to_embeddings
=
model_utils
.
get_llava_embeddings
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
...
@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
"stop_sign"
:
"caption es"
,
"stop_sign"
:
"caption es"
,
"cherry_blossom"
:
"What is in the picture?"
,
"cherry_blossom"
:
"What is in the picture?"
,
}),
}),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForImageTextToText
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"vLLM does not support PrefixLM attention mask"
)],
# noqa: E501
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"vLLM does not support PrefixLM attention mask"
)],
# noqa: E501
...
@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
...
@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }),
# }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
# stop_str=["<|im_end|>"],
# stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)],
# image_size_factors=[(0.10, 0.15)],
# max_tokens=64,
# max_tokens=64,
...
@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
...
@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
IMAGE
,
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
img_idx_to_prompt
=
lambda
idx
:
""
,
img_idx_to_prompt
=
lambda
idx
:
""
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
blip2_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
blip2_vllm_to_hf_output
,
),
),
"chameleon"
:
VLMTestInfo
(
"chameleon"
:
VLMTestInfo
(
...
@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
...
@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForImageTextToText
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
# For chameleon, we only compare the sequences
# For chameleon, we only compare the sequences
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
...
@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
...
@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
}),
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
...
@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
...
@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
""
,
img_idx_to_prompt
=
lambda
idx
:
""
,
max_model_len
=
2048
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
num_logprobs
=
10
,
num_logprobs
=
10
,
...
@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
...
@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt
=
"<start_of_image><start_of_image>Describe the two images in detail."
,
# noqa: E501
multi_image_prompt
=
"<start_of_image><start_of_image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
# TODO: Use AutoModelForVision2Seq once transformers supports this
auto_cls
=
AutoModelForImageTextToText
,
auto_cls
=
AutoModelForPreTraining
,
dtype
=
"bfloat16"
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"do_pan_and_scan"
:
True
}},
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"do_pan_and_scan"
:
True
}},
patch_hf_runner
=
model_utils
.
gemma3_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
gemma3_patch_hf_runner
,
),
),
...
@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
...
@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
}),
}),
max_model_len
=
2048
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
patch_hf_runner
=
model_utils
.
glm4v_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
glm4v_patch_hf_runner
,
# The image embeddings match with HF but the outputs of the language
# The image embeddings match with HF but the outputs of the language
...
@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
...
@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
}),
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
8192
,
max_model_len
=
8192
,
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
num_logprobs
=
10
,
num_logprobs
=
10
,
patch_hf_runner
=
model_utils
.
h2ovl_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
h2ovl_patch_hf_runner
,
...
@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
...
@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
hf_output_post_proc
=
model_utils
.
idefics3_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
idefics3_trunc_hf_output
,
),
),
"intern_vl"
:
VLMTestInfo
(
"intern_vl"
:
VLMTestInfo
(
...
@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
...
@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
}),
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
4096
,
max_model_len
=
4096
,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
),
),
...
@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
...
@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
max_model_len
=
10240
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
...
@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
num_video_frames
=
16
,
num_video_frames
=
16
,
max_model_len
=
16384
,
max_model_len
=
16384
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values_videos"
),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
...
@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
...
@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
prompt_formatter
=
lambda
img_prompt
:
f
"<|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
max_model_len
=
4096
,
max_model_len
=
4096
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
get_stop_token_ids
=
lambda
tok
:
[
128009
],
get_stop_token_ids
=
lambda
tok
:
[
128009
],
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
patch_hf_runner
=
model_utils
.
mantis_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
mantis_patch_hf_runner
,
marks
=
[
marks
=
[
...
@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
...
@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
[
tok
.
eos_id
,
tok
.
eot_id
],
get_stop_token_ids
=
lambda
tok
:
[
tok
.
eos_id
,
tok
.
eot_id
],
postprocess_inputs
=
model_utils
.
wrap_inputs_post_processor
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmv_25_patch_hf_runner
,
),
),
"minicpmo_26"
:
VLMTestInfo
(
"minicpmo_26"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-o-2_6"
],
models
=
[
"openbmb/MiniCPM-o-2_6"
],
...
@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
...
@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmo_patch_hf_runner
patch_hf_runner
=
model_utils
.
minicpmo_
26_
patch_hf_runner
,
),
),
"minicpmv_26"
:
VLMTestInfo
(
"minicpmv_26"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-V-2_6"
],
models
=
[
"openbmb/MiniCPM-V-2_6"
],
...
@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
...
@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmv_26_patch_hf_runner
,
),
),
"molmo"
:
VLMTestInfo
(
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
models
=
[
"allenai/Molmo-7B-D-0924"
],
...
@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
...
@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
),
),
# Tests for phi3v currently live in another file because of a bug in
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# transformers. Once this issue is fixed, we can enable them here instead.
...
@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
...
@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
"[IMG]"
,
img_idx_to_prompt
=
lambda
idx
:
"[IMG]"
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
),
"qwen_vl"
:
VLMTestInfo
(
"qwen_vl"
:
VLMTestInfo
(
...
@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
...
@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
models
=
[
"facebook/chameleon-7b"
],
models
=
[
"facebook/chameleon-7b"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForImageTextToText
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
comparator
=
check_outputs_equal
,
comparator
=
check_outputs_equal
,
...
@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
...
@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
...
@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
...
@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
max_model_len
=
10240
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
...
@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
...
@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
16384
,
max_model_len
=
16384
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
custom_test_opts
=
[
CustomTestOptions
(
...
...
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
ca796e19
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
Run `pytest tests/models/test_mistral.py`.
Run `pytest tests/models/test_mistral.py`.
"""
"""
import
json
import
json
import
uuid
from
dataclasses
import
asdict
from
dataclasses
import
asdict
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
...
@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
...
@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from
mistral_common.tokens.tokenizers.multimodal
import
image_from_chunk
from
mistral_common.tokens.tokenizers.multimodal
import
image_from_chunk
from
transformers
import
AutoProcessor
from
transformers
import
AutoProcessor
from
vllm
import
(
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
,
from
vllm
import
RequestOutput
,
SamplingParams
,
TextPrompt
,
TokensPrompt
TextPrompt
,
TokensPrompt
)
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.sequence
import
Logprob
,
SampleLogprobs
from
vllm.sequence
import
Logprob
,
SampleLogprobs
...
@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
...
@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
_typeshed
import
StrPath
from
_typeshed
import
StrPath
MODELS
=
[
"mistralai/Pixtral-12B-2409"
]
PIXTRAL_ID
=
"mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS
=
[
PIXTRAL_ID
,
MISTRAL_SMALL_3_1_ID
]
IMG_URLS
=
[
IMG_URLS
=
[
"https://picsum.photos/id/237/400/300"
,
"https://picsum.photos/id/237/400/300"
,
"https://picsum.photos/id/231/200/300"
,
"https://picsum.photos/id/231/200/300"
,
...
@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
...
@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
assert
FIXTURES_PATH
.
exists
()
assert
FIXTURES_PATH
.
exists
()
FIXTURE_LOGPROBS_CHAT
=
FIXTURES_PATH
/
"pixtral_chat.json"
FIXTURE_LOGPROBS_CHAT
=
{
FIXTURE_LOGPROBS_ENGINE
=
FIXTURES_PATH
/
"pixtral_chat_engine.json"
PIXTRAL_ID
:
FIXTURES_PATH
/
"pixtral_chat.json"
,
MISTRAL_SMALL_3_1_ID
:
FIXTURES_PATH
/
"mistral_small_3_chat.json"
,
}
OutputsLogprobs
=
list
[
tuple
[
list
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
OutputsLogprobs
=
list
[
tuple
[
list
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
...
@@ -166,12 +170,12 @@ def test_chat(
...
@@ -166,12 +170,12 @@ def test_chat(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
)
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
[
model
])
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
enable_chunked_prefill
=
False
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
)
as
vllm_model
:
...
@@ -183,70 +187,40 @@ def test_chat(
...
@@ -183,70 +187,40 @@ def test_chat(
outputs
.
extend
(
output
)
outputs
.
extend
(
output
)
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
# Remove last `None` prompt_logprobs to compare with fixture
for
i
in
range
(
len
(
logprobs
)):
assert
logprobs
[
i
][
-
1
]
is
None
logprobs
[
i
]
=
logprobs
[
i
][:
-
1
]
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_CHAT_LOGPROBS
,
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_CHAT_LOGPROBS
,
outputs_1_lst
=
logprobs
,
outputs_1_lst
=
logprobs
,
name_0
=
"h100_ref"
,
name_0
=
"h100_ref"
,
name_1
=
"output"
)
name_1
=
"output"
)
@
large_gpu_test
(
min_gb
=
80
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
def
test_model_engine
(
vllm_runner
,
model
:
str
,
dtype
:
str
)
->
None
:
EXPECTED_ENGINE_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_ENGINE
)
args
=
EngineArgs
(
model
=
model
,
tokenizer_mode
=
"mistral"
,
enable_chunked_prefill
=
False
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
dtype
=
dtype
,
)
engine
=
LLMEngine
.
from_engine_args
(
args
)
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
0
],
SAMPLING_PARAMS
)
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
1
],
SAMPLING_PARAMS
)
outputs
=
[]
count
=
0
while
True
:
out
=
engine
.
step
()
count
+=
1
for
request_output
in
out
:
if
request_output
.
finished
:
outputs
.
append
(
request_output
)
if
count
==
2
:
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
2
],
SAMPLING_PARAMS
)
if
not
engine
.
has_unfinished_requests
():
break
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_ENGINE_LOGPROBS
,
outputs_1_lst
=
logprobs
,
name_0
=
"h100_ref"
,
name_1
=
"output"
)
@
large_gpu_test
(
min_gb
=
48
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"prompt,expected_ranges"
,
"prompt,expected_ranges"
,
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
"offset"
:
1
0
,
"offset"
:
1
1
,
"length"
:
494
"length"
:
494
}]),
}]),
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
"offset"
:
1
0
,
"offset"
:
1
1
,
"length"
:
266
"length"
:
266
},
{
},
{
"offset"
:
27
6
,
"offset"
:
27
7
,
"length"
:
1056
"length"
:
1056
},
{
},
{
"offset"
:
133
2
,
"offset"
:
133
3
,
"length"
:
418
"length"
:
418
}])])
}])])
def
test_multi_modal_placeholders
(
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
vllm_runner
,
prompt
,
expected_ranges
:
list
[
PlaceholderRange
])
->
None
:
expected_ranges
:
list
[
PlaceholderRange
],
monkeypatch
)
->
None
:
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
with
vllm_runner
(
with
vllm_runner
(
"mistral-community/pixtral-12b"
,
"mistral-community/pixtral-12b"
,
max_model_len
=
8192
,
max_model_len
=
8192
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
ca796e19
...
@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
...
@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers
import
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
...
@@ -31,7 +30,6 @@ def run_test(
...
@@ -31,7 +30,6 @@ def run_test(
vllm_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
vllm_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
auto_cls
:
type
[
_BaseAutoModelClass
],
auto_cls
:
type
[
_BaseAutoModelClass
],
use_tokenizer_eos
:
bool
,
use_tokenizer_eos
:
bool
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
comparator
:
Callable
[...,
None
],
comparator
:
Callable
[...,
None
],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
stop_str
:
Optional
[
list
[
str
]],
stop_str
:
Optional
[
list
[
str
]],
...
@@ -101,7 +99,6 @@ def run_test(
...
@@ -101,7 +99,6 @@ def run_test(
hf_model
=
hf_runner
(
model
,
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
auto_cls
=
auto_cls
,
auto_cls
=
auto_cls
,
postprocess_inputs
=
postprocess_inputs
,
model_kwargs
=
hf_model_kwargs
)
model_kwargs
=
hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
# Some models need to patch things like the model processor, e.g., internvl
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
ca796e19
...
@@ -6,16 +6,15 @@ typically specific to a small subset of models.
...
@@ -6,16 +6,15 @@ typically specific to a small subset of models.
import
re
import
re
import
types
import
types
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
typing
import
Callable
,
Optional
,
Union
from
typing
import
Optional
,
Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
Batch
Encoding
,
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
Batch
Feature
,
GenerationConfig
)
GenerationConfig
)
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
from
.types
import
RunnerOutput
from
.types
import
RunnerOutput
...
@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
...
@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
return
[
asset
.
image_embeds
for
asset
in
image_assets
]
return
[
asset
.
image_embeds
for
asset
in
image_assets
]
####### postprocessors to run on HF BatchEncoding
def
cast_dtype_post_processor
(
hf_inp_key
:
str
)
->
Callable
[[
BatchEncoding
,
str
],
BatchEncoding
]:
"""Gets a handle to a post processor which converts a given key into a
target data type."""
def
process
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
hf_inputs
[
hf_inp_key
]
=
hf_inputs
[
hf_inp_key
].
to
(
torch_dtype
)
return
hf_inputs
return
process
def
ignore_inputs_post_processor
(
hf_inp_key
:
str
)
->
Callable
[[
BatchEncoding
,
str
],
BatchEncoding
]:
"""Gets a handle to a post processor which ignores a given key."""
def
process
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
del
hf_inputs
[
hf_inp_key
]
return
hf_inputs
return
process
def
wrap_inputs_post_processor
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
return
{
"model_inputs"
:
hf_inputs
}
def
molmo_post_processor
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
hf_inputs
=
cast_dtype_post_processor
(
"images"
)(
hf_inputs
,
dtype
)
return
{
k
:
v
.
unsqueeze
(
0
)
for
k
,
v
in
hf_inputs
.
items
()}
####### Prompt path encoders for models that need models on disk
####### Prompt path encoders for models that need models on disk
def
qwen_prompt_path_encoder
(
def
qwen_prompt_path_encoder
(
tmp_path
:
PosixPath
,
prompt
:
str
,
assets
:
Union
[
list
[
ImageAsset
],
tmp_path
:
PosixPath
,
prompt
:
str
,
assets
:
Union
[
list
[
ImageAsset
],
...
@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
for
k
in
inputs
.
keys
()
# noqa
for
k
in
inputs
.
keys
()
# noqa
if
k
not
in
(
"seq_lens"
,
"sft_format"
)
if
k
not
in
(
"seq_lens"
,
"sft_format"
)
}
}
inputs
=
BatchEncoding
(
data
=
inputs
,
tensor_type
=
"pt"
)
return
BatchFeature
(
data
=
inputs
,
tensor_type
=
"pt"
)
return
inputs
hf_model
.
processor
=
processor
hf_model
.
processor
=
processor
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
...
@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
return
hf_model
def
minicpm
o
_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
minicpm
v_25
_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
**
kwargs
):
def
_generate
(
self
,
*
args
,
input_ids
=
None
,
pixel_values
=
None
,
image_sizes
=
None
,
image_bound
=
None
,
tgt_sizes
=
None
,
**
kwargs
,
):
model_inputs
=
{
"input_ids"
:
input_ids
,
"pixel_values"
:
pixel_values
,
"image_sizes"
:
image_sizes
,
"image_bound"
:
image_bound
,
"tgt_sizes"
:
tgt_sizes
,
}
for
k
in
list
(
model_inputs
.
keys
()):
if
model_inputs
[
k
]
is
None
:
model_inputs
.
pop
(
k
)
return
orig_generate
(
model_inputs
,
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
def
minicpmo_26_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
image_sizes
=
None
,
**
kwargs
):
return
orig_generate
(
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
def
minicpmv_26_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
image_sizes
=
None
,
**
kwargs
):
return
orig_generate
(
*
args
,
decode_text
=
False
,
**
kwargs
)
return
orig_generate
(
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
...
@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def
_generate
(
self
,
max_new_tokens
=
None
,
do_sample
=
None
,
**
kwargs
):
def
_generate
(
self
,
max_new_tokens
=
None
,
do_sample
=
None
,
**
kwargs
):
batch
=
{
batch
=
{
k
:
kwargs
.
pop
(
k
)
k
:
kwargs
.
pop
(
k
)
.
unsqueeze
(
0
)
for
k
in
(
"input_ids"
,
"images"
,
"image_input_idx"
,
"image_masks"
)
for
k
in
(
"input_ids"
,
"images"
,
"image_input_idx"
,
"image_masks"
)
if
k
in
kwargs
if
k
in
kwargs
}
}
batch
=
BatchFeature
(
batch
).
to
(
dtype
=
self
.
dtype
)
return
self
.
generate_from_batch
(
return
self
.
generate_from_batch
(
batch
,
batch
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
ca796e19
...
@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
...
@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
pytest
import
MarkDecorator
from
transformers
import
AutoModelForCausalLM
,
BatchEncoding
from
transformers
import
AutoModelForCausalLM
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
identity
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
from
....utils
import
check_logprobs_close
from
....utils
import
check_logprobs_close
...
@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
...
@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
# Indicates we should explicitly pass the EOS from the tokenizer
# Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos
:
bool
=
False
use_tokenizer_eos
:
bool
=
False
auto_cls
:
type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
auto_cls
:
type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
# Callable to pass to the HF runner to run on inputs; for now, we also pass
# the data type to input post processing, because almost all of the uses of
# postprocess_inputs are to fix the data types of BatchEncoding values.
postprocess_inputs
:
Callable
[[
BatchEncoding
,
str
],
BatchEncoding
]
=
identity
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]]
=
None
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]]
=
None
# Post processors that if defined, will run oun the outputs of the
# Post processors that if defined, will run oun the outputs of the
...
@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
...
@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
# is all combinations of .models + all fields below
# is all combinations of .models + all fields below
max_tokens
:
Union
[
int
,
tuple
[
int
]]
=
128
max_tokens
:
Union
[
int
,
tuple
[
int
]]
=
128
num_logprobs
:
Union
[
int
,
tuple
[
int
]]
=
5
num_logprobs
:
Union
[
int
,
tuple
[
int
]]
=
5
dtype
:
Union
[
str
,
Iterable
[
str
]]
=
"
half
"
dtype
:
Union
[
str
,
Union
[
list
[
str
],
tuple
[
str
,
...]
]]
=
"
auto
"
distributed_executor_backend
:
Optional
[
Union
[
str
,
Iterable
[
str
]]]
=
None
distributed_executor_backend
:
Optional
[
Union
[
str
,
Iterable
[
str
]]]
=
None
# Only expanded in video tests
# Only expanded in video tests
num_video_frames
:
Union
[
int
,
tuple
[
int
]]
=
16
num_video_frames
:
Union
[
int
,
tuple
[
int
]]
=
16
...
@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
...
@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
"vllm_output_post_proc"
:
self
.
vllm_output_post_proc
,
"vllm_output_post_proc"
:
self
.
vllm_output_post_proc
,
"auto_cls"
:
self
.
auto_cls
,
"auto_cls"
:
self
.
auto_cls
,
"use_tokenizer_eos"
:
self
.
use_tokenizer_eos
,
"use_tokenizer_eos"
:
self
.
use_tokenizer_eos
,
"postprocess_inputs"
:
self
.
postprocess_inputs
,
"comparator"
:
self
.
comparator
,
"comparator"
:
self
.
comparator
,
"get_stop_token_ids"
:
self
.
get_stop_token_ids
,
"get_stop_token_ids"
:
self
.
get_stop_token_ids
,
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
...
...
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
View file @
ca796e19
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
typing
import
Callable
from
typing
import
Callable
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn.functional
as
F
from
PIL
import
Image
from
PIL
import
Image
from
transformers
import
BatchEncoding
,
Qwen2VLForConditionalGeneration
from
transformers
import
Qwen2VLForConditionalGeneration
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
...
@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
...
@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
return
prompt
return
prompt
def
postprocess_inputs
(
hf_model
:
HfRunner
,
inputs
:
BatchEncoding
,
**
kwargs
):
return
hf_model
.
model
.
prepare_inputs_for_generation
(
**
inputs
,
**
kwargs
)
def
_run_test
(
def
_run_test
(
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
...
@@ -118,14 +114,8 @@ def _run_test(
...
@@ -118,14 +114,8 @@ def _run_test(
with
hf_runner
(
model
,
with
hf_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
auto_cls
=
Qwen2VLForConditionalGeneration
)
as
hf_model
:
auto_cls
=
Qwen2VLForConditionalGeneration
)
as
hf_model
:
hf_model
.
postprocess_inputs
=
partial
(
postprocess_inputs
,
prompts
=
[]
hf_model
,
cache_position
=
torch
.
arange
(
0
,
1
,
# 1 for batch size
requires_grad
=
False
),
use_cache
=
False
)
for
text
,
image
,
embed_text
in
zip
(
input_texts
,
input_images
,
for
text
,
image
,
embed_text
in
zip
(
input_texts
,
input_images
,
embed_texts
):
embed_texts
):
# dse requires non-standard input processing
# dse requires non-standard input processing
...
@@ -133,20 +123,34 @@ def _run_test(
...
@@ -133,20 +123,34 @@ def _run_test(
messages
=
get_messages
(
image
,
text
,
embed_text
)
messages
=
get_messages
(
image
,
text
,
embed_text
)
prompt
=
apply_chat_template_and_add_eos
(
prompt
=
apply_chat_template_and_add_eos
(
messages
,
hf_model
.
processor
.
apply_chat_template
)
messages
,
hf_model
.
processor
.
apply_chat_template
)
inputs
=
hf_model
.
get_inputs
(
prompts
=
[[
prompt
]],
prompts
.
append
(
prompt
)
images
=
[[
image
]],
all_inputs
=
hf_model
.
get_inputs
(
prompts
=
prompts
,
images
=
input_images
,
)
)
with
torch
.
no_grad
():
with
torch
.
no_grad
():
all_outputs
=
[]
for
inputs
in
all_inputs
:
inputs
=
hf_model
.
model
.
prepare_inputs_for_generation
(
**
inputs
,
cache_position
=
torch
.
arange
(
1
),
# 1 for batch size
use_cache
=
False
,
)
outputs
=
hf_model
.
model
(
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
[
0
],
**
hf_model
.
wrap_device
(
inputs
),
device
=
hf_model
.
model
.
device
.
type
),
return_dict
=
True
,
return_dict
=
True
,
output_hidden_states
=
True
,
output_hidden_states
=
True
,
)
)
pooled_output
=
torch
.
nn
.
functional
.
normalize
(
pooled_output
=
F
.
normalize
(
outputs
.
hidden_states
[
-
1
][
0
,
-
1
],
outputs
.
hidden_states
[
-
1
][
0
,
-
1
],
p
=
2
,
dim
=-
1
)
p
=
2
,
hf_outputs
.
append
(
pooled_output
.
tolist
())
dim
=-
1
)
all_outputs
.
append
(
pooled_output
.
tolist
())
hf_outputs
=
all_outputs
check_embeddings_close
(
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_0_lst
=
hf_outputs
,
...
...
tests/models/embedding/vision_language/test_llava_next.py
View file @
ca796e19
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
import
pytest
import
pytest
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
transformers
import
AutoModelFor
Vision2Seq
from
transformers
import
AutoModelFor
ImageTextToText
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -70,7 +70,7 @@ def _run_test(
...
@@ -70,7 +70,7 @@ def _run_test(
vllm_outputs
=
vllm_model
.
encode
(
input_texts
,
images
=
input_images
)
vllm_outputs
=
vllm_model
.
encode
(
input_texts
,
images
=
input_images
)
with
hf_runner
(
model
,
dtype
=
dtype
,
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelFor
Vision2Seq
)
as
hf_model
:
auto_cls
=
AutoModelFor
ImageTextToText
)
as
hf_model
:
# Patch the issue where generation_config.json is missing
# Patch the issue where generation_config.json is missing
hf_model
.
processor
.
patch_size
=
\
hf_model
.
processor
.
patch_size
=
\
hf_model
.
model
.
config
.
vision_config
.
patch_size
hf_model
.
model
.
config
.
vision_config
.
patch_size
...
@@ -86,8 +86,7 @@ def _run_test(
...
@@ -86,8 +86,7 @@ def _run_test(
for
inputs
in
all_inputs
:
for
inputs
in
all_inputs
:
# Based on: https://huggingface.co/royokong/e5-v
# Based on: https://huggingface.co/royokong/e5-v
outputs
=
hf_model
.
model
(
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
,
**
hf_model
.
wrap_device
(
inputs
),
device
=
hf_model
.
model
.
device
.
type
),
return_dict
=
True
,
return_dict
=
True
,
output_hidden_states
=
True
,
output_hidden_states
=
True
,
)
)
...
...
tests/models/embedding/vision_language/test_phi3v.py
View file @
ca796e19
...
@@ -53,8 +53,7 @@ def _run_test(
...
@@ -53,8 +53,7 @@ def _run_test(
for
inputs
in
all_inputs
:
for
inputs
in
all_inputs
:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs
=
hf_model
.
model
(
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
,
**
hf_model
.
wrap_device
(
inputs
),
device
=
hf_model
.
model
.
device
.
type
),
return_dict
=
True
,
return_dict
=
True
,
output_hidden_states
=
True
,
output_hidden_states
=
True
,
)
)
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
ca796e19
...
@@ -4,8 +4,7 @@ from typing import Optional, overload
...
@@ -4,8 +4,7 @@ from typing import Optional, overload
import
pytest
import
pytest
import
torch
import
torch
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
from
transformers
import
AutoConfig
,
AutoModelForImageTextToText
,
AutoTokenizer
BatchEncoding
)
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
...
@@ -227,14 +226,10 @@ def _run_test(
...
@@ -227,14 +226,10 @@ def _run_test(
for
prompts
,
images
in
inputs
for
prompts
,
images
in
inputs
]
]
def
process
(
hf_inputs
:
BatchEncoding
,
**
kwargs
):
return
hf_inputs
with
hf_runner
(
model
,
with
hf_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
model_kwargs
=
{
"device_map"
:
"auto"
},
model_kwargs
=
{
"device_map"
:
"auto"
},
postprocess_inputs
=
process
,
auto_cls
=
AutoModelForImageTextToText
)
as
hf_model
:
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
hf_outputs_per_image
=
[
hf_outputs_per_image
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
max_tokens
,
...
...
tests/models/fixtures/mistral_small_3_chat.json
0 → 100644
View file @
ca796e19
This diff is collapsed.
Click to expand it.
tests/models/fixtures/pixtral_chat_engine.json
deleted
100644 → 0
View file @
e983c804
This diff is collapsed.
Click to expand it.
tests/models/utils.py
View file @
ca796e19
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
import
warnings
import
warnings
from
collections.abc
import
Sequence
from
collections.abc
import
Sequence
from
typing
import
Optional
,
Union
from
typing
import
Any
,
Optional
,
Union
import
torch
import
torch
...
@@ -254,9 +254,9 @@ def check_logprobs_close(
...
@@ -254,9 +254,9 @@ def check_logprobs_close(
def
build_model_context
(
def
build_model_context
(
model_id
:
str
,
model_id
:
str
,
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]
]
=
None
,
dtype
:
Union
[
str
,
torch
.
dtype
]
=
"auto"
,
mm_processor_kwargs
:
Optional
[
dict
]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]
]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]
]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
True
,
disable_mm_preprocessor_cache
:
bool
=
True
,
):
):
"""Creates an InputContext for a given model.
"""Creates an InputContext for a given model.
...
@@ -274,9 +274,6 @@ def build_model_context(
...
@@ -274,9 +274,6 @@ def build_model_context(
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
if
dtype
is
None
:
dtype
=
"half"
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_id
,
model_id
,
task
=
task
,
task
=
task
,
...
...
tests/multimodal/test_processing.py
View file @
ca796e19
...
@@ -7,19 +7,25 @@ from unittest.mock import MagicMock
...
@@ -7,19 +7,25 @@ from unittest.mock import MagicMock
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
torch
from
transformers
import
ProcessorMixin
from
transformers
import
ProcessorMixin
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldElem
,
MultiModalKwargs
,
MultiModalKwargsItem
,
MultiModalSharedField
)
# yapf conflicts with isort for this block
# yapf conflicts with isort for this block
# yapf: disable
# yapf: disable
from
vllm.multimodal.processing
import
(
PlaceholderFeaturesInfo
,
from
vllm.multimodal.processing
import
(
PlaceholderFeaturesInfo
,
PromptIndexTargets
,
PromptInsertion
,
ProcessingCache
,
PromptIndexTargets
,
PromptReplacement
,
apply_text_matches
,
PromptInsertion
,
PromptReplacement
,
apply_text_matches
,
apply_token_matches
,
apply_token_matches
,
find_mm_placeholders
,
find_mm_placeholders
,
find_text_matches
,
find_token_matches
,
find_text_matches
,
find_token_matches
,
iter_token_matches
)
iter_token_matches
,
replace_token_matches
)
# yapf: enable
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
...
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
...
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
assert
all
(
match_len
==
len
(
match_ids
)
for
match_len
in
match_lens
)
assert
all
(
match_len
==
len
(
match_ids
)
for
match_len
in
match_lens
)
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"token_ids"
,
"match_ids"
,
"new_ids"
,
"expected"
),
[
([],
[],
[
-
1
],
[]),
([],
[
32000
],
[
-
1
],
[]),
(
[
32000
,
32000
,
32000
],
[
32000
],
[
-
1
],
[
-
1
,
-
1
,
-
1
],
),
(
[
32000
,
32000
,
32000
],
[
32000
,
32000
],
[
-
1
],
[
-
1
,
32000
],
),
(
[
32000
,
32000
,
32000
],
[
32000
,
32000
,
32000
],
[
-
1
],
[
-
1
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
32000
],
[
-
1
],
[
9833
,
-
1
,
32000
,
32000
,
9833
,
-
1
,
32000
,
918
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
32000
,
32000
,
32000
],
[
-
1
],
[
9833
,
-
1
,
9833
,
28747
,
32000
,
32000
,
918
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
0
,
32000
],
[
-
1
],
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
),
],
)
# yapf: enable
def
test_replace_token_matches
(
token_ids
,
match_ids
,
new_ids
,
expected
):
result
=
replace_token_matches
(
token_ids
,
match_ids
,
new_ids
)
# Manually constructed results
assert
result
==
expected
# yapf: disable
# yapf: disable
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"prompt"
,
"target_by_key"
,
"expected_by_key"
),
(
"prompt"
,
"target_by_key"
,
"expected_by_key"
),
...
@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
...
@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
assert
result
==
expected
assert
result
==
expected
def
_dummy_elem
(
modality
:
str
,
key
:
str
,
size
:
int
):
return
MultiModalFieldElem
(
modality
=
modality
,
key
=
key
,
data
=
torch
.
empty
((
size
,
),
dtype
=
torch
.
int8
),
field
=
MultiModalSharedField
(
1
),
)
def
_dummy_item
(
modality
:
str
,
size_by_key
:
dict
[
str
,
int
]):
return
MultiModalKwargsItem
.
from_elems
([
_dummy_elem
(
modality
,
key
,
size
)
for
key
,
size
in
size_by_key
.
items
()
])
def
_dummy_kw
(
size_by_key_modality
:
dict
[
str
,
dict
[
str
,
int
]]):
return
MultiModalKwargs
.
from_items
([
_dummy_item
(
modality
,
size_by_key
)
for
modality
,
size_by_key
in
size_by_key_modality
.
items
()
])
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"item"
,
"expected_size"
),
[
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_kw
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
],
)
# yapf: enable
def
test_cache_item_size
(
item
,
expected_size
):
cache
=
ProcessingCache
.
get_lru_cache
(
2048
,
type
(
item
))
cache
[
""
]
=
item
assert
cache
.
currsize
==
expected_size
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"limit"
,
"num_supported"
,
"is_valid"
),
(
"limit"
,
"num_supported"
,
"is_valid"
),
...
@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
...
@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
seed
=
0
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
revision
=
None
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
)
...
@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
...
@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
seed
=
0
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
revision
=
None
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
)
...
@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
...
@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
seed
=
0
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
revision
=
None
,
)
)
...
...
tests/neuron/1_core/test_prefix_prefill.py
View file @
ca796e19
...
@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
...
@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
# Test edge cases
# Test edge cases
(
1
,
128
,
16
,
1024
,
4
,
2
,
16
,
False
),
# large decode batch
(
1
,
128
,
16
,
1024
,
4
,
2
,
16
,
False
),
# large decode batch
(
16
,
4
,
8
,
8192
,
4
8
,
1
,
128
,
True
),
# large prefill batch
(
16
,
4
,
8
,
1024
,
4
,
2
,
128
,
True
),
# large prefill batch
(
4
,
12
,
32
,
2048
,
16
,
1
,
32
,
True
),
# multi-head attention (MHA)
(
4
,
12
,
32
,
2048
,
16
,
1
,
32
,
True
),
# multi-head attention (MHA)
(
4
,
12
,
32
,
2048
,
16
,
16
,
32
,
True
),
# multi-query attention (MQA)
(
4
,
12
,
32
,
2048
,
16
,
16
,
32
,
True
),
# multi-query attention (MQA)
])
])
...
...
tests/quantization/test_bitsandbytes.py
View file @
ca796e19
...
@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
...
@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test
=
[
models_4bit_to_test
=
[
(
"facebook/opt-125m"
,
"quantize opt model inflight"
),
(
"facebook/opt-125m"
,
"quantize opt model inflight"
),
(
"mistralai/Mistral-7B-Instruct-v0.3"
,
"quantize inflight model with both HF and Mistral format weights"
)
]
]
models_pre_qaunt_4bit_to_test
=
[
models_pre_qaunt_4bit_to_test
=
[
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
ca796e19
...
@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
...
@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
# Serialize model before deserializing and binding LoRA adapters
# Serialize model before deserializing and binding LoRA adapters
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
with
vllm_runner
(
model_ref
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
vllm_model
.
apply_model
(
vllm_model
.
apply_model
(
...
@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
...
@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
@
pytest
.
mark
.
skipif
(
not
is_curl_installed
(),
reason
=
"cURL is not installed"
)
@
pytest
.
mark
.
skipif
(
not
is_curl_installed
(),
reason
=
"cURL is not installed"
)
def
test_openai_apiserver_with_tensorizer
(
vllm_runner
,
tmp_path
):
def
test_openai_apiserver_with_tensorizer
(
vllm_runner
,
tmp_path
):
## Serialize model
## Serialize model
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
with
vllm_runner
(
model_ref
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
vllm_model
.
apply_model
(
vllm_model
.
apply_model
(
...
...
tests/tpu/test_compilation.py
View file @
ca796e19
...
@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
...
@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
# disable custom dispatcher, let Dynamo takes over
# disable custom dispatcher, let Dynamo takes over
# all the control
# all the control
llm
=
LLM
(
model
=
"google/gemma-2b"
,
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
512
,
max_num_seqs
=
64
,
enforce_eager
=
True
,
enforce_eager
=
True
,
compilation_config
=
{
"level"
:
CompilationLevel
.
DYNAMO_AS_IS
})
compilation_config
=
{
"level"
:
CompilationLevel
.
DYNAMO_AS_IS
})
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
...
@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
assert
generated_text
.
startswith
(
answer
)
assert
generated_text
.
startswith
(
answer
)
compiled_code
=
sorted
(
compiled_code
s
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
# we should only trigger Dynamo compilation three times:
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
# one for the profiling phase without kv cache
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
# one for the prefill phase with symbolic shapes
# one for the decode phase with symbolic shapes
# We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again.
# and later calls should not trigger Dynamo compilation again.
# NOTE: it might still trigger XLA compilation.
# NOTE: It might still trigger XLA compilation.
# Check we have 4 compiled codes
assert
len
(
compiled_codes
)
==
4
# check we have three compiled code
kv_cache_prefix
=
"kv_cache"
# this is the assumption when we use the custom dispatcher
attn_prefix
=
"ragged_paged_attention"
assert
len
(
compiled_code
)
==
3
#
c
heck all the compilations are as expected
#
C
heck all the compilations are as expected
compiled_fn
=
sorted
(
compiled_fn
s
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
# the first compilation is the profiling phase,
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
# it should not have any kv cache
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
with
open
(
compiled_fn
[
0
])
as
f
:
# The first compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
0
])
as
f
:
content
=
f
.
read
()
assert
kv_cache_prefix
not
in
content
# The second compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
1
])
as
f
:
content
=
f
.
read
()
content
=
f
.
read
()
assert
"
kv_cache
s"
not
in
content
assert
kv_cache
_prefix
not
in
content
#
t
he
secon
d compilation is
the prefill phase,
#
T
he
thir
d compilation is
shape 16, so it should have kv_caches and the
#
it should have kv cache and the flash
_attention
op
#
ragged_paged
_attention
with
open
(
compiled_fn
[
1
])
as
f
:
with
open
(
compiled_fn
s
[
2
])
as
f
:
content
=
f
.
read
()
content
=
f
.
read
()
assert
"
kv_cache
s"
in
content
and
"torch.ops.xla.flash_attention"
in
content
assert
(
kv_cache
_prefix
in
content
and
attn_prefix
in
content
)
#
t
he
third
compilation is
the decode phase,
#
T
he
forth
compilation is
shape 32, so it should have kv_caches and the
#
it should have kv cache and the
paged_attention
op
#
ragged_
paged_attention
with
open
(
compiled_fn
[
2
])
as
f
:
with
open
(
compiled_fn
s
[
3
])
as
f
:
content
=
f
.
read
()
content
=
f
.
read
()
assert
"
kv_cache
s"
in
content
and
"torch.ops.xla.paged_attention"
in
content
assert
(
kv_cache
_prefix
in
content
and
attn_prefix
in
content
)
tests/tpu/test_custom_dispatcher.py
View file @
ca796e19
...
@@ -14,12 +14,17 @@ from ..utils import compare_two_settings
...
@@ -14,12 +14,17 @@ from ..utils import compare_two_settings
def
test_custom_dispatcher
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_custom_dispatcher
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_RPC_TIMEOUT"
,
"30000"
)
m
.
setenv
(
"VLLM_RPC_TIMEOUT"
,
"30000"
)
compare_two_settings
(
compare_two_settings
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"google/gemma-2b"
,
arg1
=
[
arg1
=
[
"--max-model-len=256"
,
"--max-num-seqs=32"
,
"--enforce-eager"
,
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_ONCE
}
"
,
f
"-O
{
CompilationLevel
.
DYNAMO_ONCE
}
"
,
],
],
arg2
=
[
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_AS_IS
}
"
],
arg2
=
[
"--max-model-len=256"
,
"--max-num-seqs=32"
,
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_AS_IS
}
"
],
env1
=
{},
env1
=
{},
env2
=
{})
env2
=
{})
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment