Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ca796e19
Commit
ca796e19
authored
Mar 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.1' into v0.8.1-ori
parents
e983c804
61c7a1b8
Changes
130
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
300 additions
and
260 deletions
+300
-260
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+3
-3
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+2
-13
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+17
-50
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+27
-53
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+0
-3
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+49
-42
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+2
-9
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+28
-24
tests/models/embedding/vision_language/test_llava_next.py
tests/models/embedding/vision_language/test_llava_next.py
+3
-4
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+1
-2
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+2
-7
tests/models/fixtures/mistral_small_3_chat.json
tests/models/fixtures/mistral_small_3_chat.json
+1
-0
tests/models/fixtures/pixtral_chat_engine.json
tests/models/fixtures/pixtral_chat_engine.json
+0
-1
tests/models/utils.py
tests/models/utils.py
+4
-7
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+103
-6
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+1
-1
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+2
-0
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+2
-2
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+39
-24
tests/tpu/test_custom_dispatcher.py
tests/tpu/test_custom_dispatcher.py
+14
-9
No files found.
tests/entrypoints/test_chat_utils.py
View file @
ca796e19
...
...
@@ -34,7 +34,7 @@ def phi3v_model_config():
tokenizer
=
PHI3V_MODEL_ID
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"
bfloat16
"
,
dtype
=
"
auto
"
,
seed
=
0
,
limit_mm_per_prompt
=
{
"image"
:
2
,
...
...
@@ -58,7 +58,7 @@ def mllama_model_config():
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"
bfloat16
"
,
dtype
=
"
auto
"
,
seed
=
0
,
limit_mm_per_prompt
=
{
"image"
:
2
,
...
...
@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"
bfloat16
"
,
dtype
=
"
auto
"
,
seed
=
0
,
limit_mm_per_prompt
=
{
"image"
:
2
,
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
ca796e19
...
...
@@ -5,11 +5,10 @@ from typing import Optional
import
numpy
as
np
import
pytest
import
pytest_asyncio
from
transformers
import
AutoModel
,
AutoTokenizer
,
BatchEncoding
from
transformers
import
AutoModel
,
AutoTokenizer
from
vllm.multimodal.audio
import
resample_audio
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
HfRunner
,
VllmRunner
from
....utils
import
RemoteOpenAIServer
...
...
@@ -107,8 +106,6 @@ def run_test(
**
kwargs
,
):
"""Inference result should be the same between hf and vllm."""
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
...
...
@@ -124,15 +121,7 @@ def run_test(
for
vllm_prompt
,
_
,
audio
in
prompts_and_audios
]
def
process
(
hf_inputs
:
BatchEncoding
,
**
kwargs
):
hf_inputs
[
"audio_values"
]
=
hf_inputs
[
"audio_values"
]
\
.
to
(
torch_dtype
)
# type: ignore
return
hf_inputs
with
hf_runner
(
model
,
dtype
=
dtype
,
postprocess_inputs
=
process
,
auto_cls
=
AutoModel
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_outputs_per_audio
=
[
hf_model
.
generate_greedy_logprobs_limit
(
[
hf_prompt
],
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
ca796e19
...
...
@@ -9,7 +9,7 @@ from pathlib import PosixPath
import
pytest
from
packaging.version
import
Version
from
transformers
import
AutoModelFor
PreTraining
,
AutoModelForVision2Seq
from
transformers
import
AutoModelFor
ImageTextToText
,
AutoModelForVision2Seq
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm.platforms
import
current_platform
...
...
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
convert_assets_to_embeddings
=
model_utils
.
get_llava_embeddings
,
max_model_len
=
4096
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
...
@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
"stop_sign"
:
"caption es"
,
"cherry_blossom"
:
"What is in the picture?"
,
}),
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
dtype
=
"bfloat16"
,
marks
=
[
pytest
.
mark
.
skip
(
reason
=
"vLLM does not support PrefixLM attention mask"
)],
# noqa: E501
...
...
@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
# stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)],
# max_tokens=64,
...
...
@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"Question:
{
img_prompt
}
Answer:"
,
img_idx_to_prompt
=
lambda
idx
:
""
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
blip2_vllm_to_hf_output
,
),
"chameleon"
:
VLMTestInfo
(
...
...
@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForImageTextToText
,
# For chameleon, we only compare the sequences
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
...
...
@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
stop_str
=
[
"<|end▁of▁sentence|>"
,
"<|begin▁of▁sentence|>"
],
# noqa: E501
image_size_factors
=
[(),
(
1.0
,
),
(
1.0
,
1.0
,
1.0
),
(
0.1
,
0.5
,
1.0
)],
...
...
@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
""
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
use_tokenizer_eos
=
True
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
num_logprobs
=
10
,
...
...
@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt
=
"<start_of_image><start_of_image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
# TODO: Use AutoModelForVision2Seq once transformers supports this
auto_cls
=
AutoModelForPreTraining
,
dtype
=
"bfloat16"
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"do_pan_and_scan"
:
True
}},
patch_hf_runner
=
model_utils
.
gemma3_patch_hf_runner
,
),
...
...
@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
}),
max_model_len
=
2048
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
get_stop_token_ids
=
lambda
tok
:
[
151329
,
151336
,
151338
],
patch_hf_runner
=
model_utils
.
glm4v_patch_hf_runner
,
# The image embeddings match with HF but the outputs of the language
...
...
@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
8192
,
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
num_logprobs
=
10
,
patch_hf_runner
=
model_utils
.
h2ovl_patch_hf_runner
,
...
...
@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
hf_output_post_proc
=
model_utils
.
idefics3_trunc_hf_output
,
),
"intern_vl"
:
VLMTestInfo
(
...
...
@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
}),
multi_image_prompt
=
"Image-1: <image>
\n
Image-2: <image>
\n
Describe the two images in short."
,
# noqa: E501
max_model_len
=
4096
,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype
=
"bfloat16"
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
),
...
...
@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
inputs
=
custom_inputs
.
multi_image_multi_aspect_ratio_inputs
(
...
...
@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
num_video_frames
=
16
,
max_model_len
=
16384
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values_videos"
),
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
...
...
@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|start_header_id|>user<|end_header_id|>
\n\n
{
img_prompt
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
max_model_len
=
4096
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
get_stop_token_ids
=
lambda
tok
:
[
128009
],
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
patch_hf_runner
=
model_utils
.
mantis_patch_hf_runner
,
marks
=
[
...
...
@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
[
tok
.
eos_id
,
tok
.
eot_id
],
postprocess_inputs
=
model_utils
.
wrap_inputs_post_processor
,
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmv_25_patch_hf_runner
,
),
"minicpmo_26"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-o-2_6"
],
...
...
@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmo_patch_hf_runner
patch_hf_runner
=
model_utils
.
minicpmo_
26_
patch_hf_runner
,
),
"minicpmv_26"
:
VLMTestInfo
(
models
=
[
"openbmb/MiniCPM-V-2_6"
],
...
...
@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_num_seqs
=
2
,
get_stop_token_ids
=
lambda
tok
:
tok
.
convert_tokens_to_ids
([
'<|im_end|>'
,
'<|endoftext|>'
]),
# noqa: E501
postprocess_inputs
=
model_utils
.
ignore_inputs_post_processor
(
"image_sizes"
),
hf_output_post_proc
=
model_utils
.
minicpmv_trunc_hf_output
,
patch_hf_runner
=
model_utils
.
minicpmv_26_patch_hf_runner
,
),
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
...
...
@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
max_model_len
=
4096
,
max_num_seqs
=
2
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
...
...
@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt
=
lambda
idx
:
"[IMG]"
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
"qwen_vl"
:
VLMTestInfo
(
...
...
@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
models
=
[
"facebook/chameleon-7b"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
comparator
=
check_outputs_equal
,
...
...
@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
**
COMMON_BROADCAST_SETTINGS
# type: ignore
...
...
@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
auto_cls
=
AutoModelFor
Vision2Seq
,
auto_cls
=
AutoModelFor
ImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
**
COMMON_BROADCAST_SETTINGS
# type: ignore
...
...
@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
16384
,
max_num_seqs
=
2
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
custom_test_opts
=
[
CustomTestOptions
(
...
...
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
ca796e19
...
...
@@ -4,7 +4,6 @@
Run `pytest tests/models/test_mistral.py`.
"""
import
json
import
uuid
from
dataclasses
import
asdict
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
...
...
@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from
mistral_common.tokens.tokenizers.multimodal
import
image_from_chunk
from
transformers
import
AutoProcessor
from
vllm
import
(
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
,
TextPrompt
,
TokensPrompt
)
from
vllm
import
RequestOutput
,
SamplingParams
,
TextPrompt
,
TokensPrompt
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.sequence
import
Logprob
,
SampleLogprobs
...
...
@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
if
TYPE_CHECKING
:
from
_typeshed
import
StrPath
MODELS
=
[
"mistralai/Pixtral-12B-2409"
]
PIXTRAL_ID
=
"mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS
=
[
PIXTRAL_ID
,
MISTRAL_SMALL_3_1_ID
]
IMG_URLS
=
[
"https://picsum.photos/id/237/400/300"
,
"https://picsum.photos/id/231/200/300"
,
...
...
@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
assert
FIXTURES_PATH
.
exists
()
FIXTURE_LOGPROBS_CHAT
=
FIXTURES_PATH
/
"pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE
=
FIXTURES_PATH
/
"pixtral_chat_engine.json"
FIXTURE_LOGPROBS_CHAT
=
{
PIXTRAL_ID
:
FIXTURES_PATH
/
"pixtral_chat.json"
,
MISTRAL_SMALL_3_1_ID
:
FIXTURES_PATH
/
"mistral_small_3_chat.json"
,
}
OutputsLogprobs
=
list
[
tuple
[
list
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
...
...
@@ -166,12 +170,12 @@ def test_chat(
model
:
str
,
dtype
:
str
,
)
->
None
:
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
)
EXPECTED_CHAT_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_CHAT
[
model
])
with
vllm_runner
(
model
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
,
enable_chunked_prefill
=
False
,
max_model_len
=
max_model_len
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
...
...
@@ -183,70 +187,40 @@ def test_chat(
outputs
.
extend
(
output
)
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
# Remove last `None` prompt_logprobs to compare with fixture
for
i
in
range
(
len
(
logprobs
)):
assert
logprobs
[
i
][
-
1
]
is
None
logprobs
[
i
]
=
logprobs
[
i
][:
-
1
]
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_CHAT_LOGPROBS
,
outputs_1_lst
=
logprobs
,
name_0
=
"h100_ref"
,
name_1
=
"output"
)
@
large_gpu_test
(
min_gb
=
80
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
def
test_model_engine
(
vllm_runner
,
model
:
str
,
dtype
:
str
)
->
None
:
EXPECTED_ENGINE_LOGPROBS
=
load_outputs_w_logprobs
(
FIXTURE_LOGPROBS_ENGINE
)
args
=
EngineArgs
(
model
=
model
,
tokenizer_mode
=
"mistral"
,
enable_chunked_prefill
=
False
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
dtype
=
dtype
,
)
engine
=
LLMEngine
.
from_engine_args
(
args
)
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
0
],
SAMPLING_PARAMS
)
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
1
],
SAMPLING_PARAMS
)
outputs
=
[]
count
=
0
while
True
:
out
=
engine
.
step
()
count
+=
1
for
request_output
in
out
:
if
request_output
.
finished
:
outputs
.
append
(
request_output
)
if
count
==
2
:
engine
.
add_request
(
uuid
.
uuid4
().
hex
,
ENGINE_INPUTS
[
2
],
SAMPLING_PARAMS
)
if
not
engine
.
has_unfinished_requests
():
break
logprobs
=
vllm_runner
.
_final_steps_generate_w_logprobs
(
outputs
)
check_logprobs_close
(
outputs_0_lst
=
EXPECTED_ENGINE_LOGPROBS
,
outputs_1_lst
=
logprobs
,
name_0
=
"h100_ref"
,
name_1
=
"output"
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"prompt,expected_ranges"
,
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
"offset"
:
1
0
,
"offset"
:
1
1
,
"length"
:
494
}]),
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
"offset"
:
1
0
,
"offset"
:
1
1
,
"length"
:
266
},
{
"offset"
:
27
6
,
"offset"
:
27
7
,
"length"
:
1056
},
{
"offset"
:
133
2
,
"offset"
:
133
3
,
"length"
:
418
}])])
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
expected_ranges
:
list
[
PlaceholderRange
])
->
None
:
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
expected_ranges
:
list
[
PlaceholderRange
],
monkeypatch
)
->
None
:
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
with
vllm_runner
(
"mistral-community/pixtral-12b"
,
max_model_len
=
8192
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
ca796e19
...
...
@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
import
torch
from
PIL.Image
import
Image
from
transformers
import
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
...
...
@@ -31,7 +30,6 @@ def run_test(
vllm_output_post_proc
:
Optional
[
Callable
[[
RunnerOutput
,
str
],
Any
]],
auto_cls
:
type
[
_BaseAutoModelClass
],
use_tokenizer_eos
:
bool
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
comparator
:
Callable
[...,
None
],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
stop_str
:
Optional
[
list
[
str
]],
...
...
@@ -101,7 +99,6 @@ def run_test(
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
auto_cls
,
postprocess_inputs
=
postprocess_inputs
,
model_kwargs
=
hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
ca796e19
...
...
@@ -6,16 +6,15 @@ typically specific to a small subset of models.
import
re
import
types
from
pathlib
import
PosixPath
from
typing
import
Callable
,
Optional
,
Union
from
typing
import
Optional
,
Union
import
torch
from
PIL.Image
import
Image
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
Batch
Encoding
,
from
transformers
import
(
AutoConfig
,
AutoTokenizer
,
Batch
Feature
,
GenerationConfig
)
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
from
.types
import
RunnerOutput
...
...
@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
return
[
asset
.
image_embeds
for
asset
in
image_assets
]
####### postprocessors to run on HF BatchEncoding
def
cast_dtype_post_processor
(
hf_inp_key
:
str
)
->
Callable
[[
BatchEncoding
,
str
],
BatchEncoding
]:
"""Gets a handle to a post processor which converts a given key into a
target data type."""
def
process
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
hf_inputs
[
hf_inp_key
]
=
hf_inputs
[
hf_inp_key
].
to
(
torch_dtype
)
return
hf_inputs
return
process
def
ignore_inputs_post_processor
(
hf_inp_key
:
str
)
->
Callable
[[
BatchEncoding
,
str
],
BatchEncoding
]:
"""Gets a handle to a post processor which ignores a given key."""
def
process
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
del
hf_inputs
[
hf_inp_key
]
return
hf_inputs
return
process
def
wrap_inputs_post_processor
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
return
{
"model_inputs"
:
hf_inputs
}
def
molmo_post_processor
(
hf_inputs
:
BatchEncoding
,
dtype
:
str
):
hf_inputs
=
cast_dtype_post_processor
(
"images"
)(
hf_inputs
,
dtype
)
return
{
k
:
v
.
unsqueeze
(
0
)
for
k
,
v
in
hf_inputs
.
items
()}
####### Prompt path encoders for models that need models on disk
def
qwen_prompt_path_encoder
(
tmp_path
:
PosixPath
,
prompt
:
str
,
assets
:
Union
[
list
[
ImageAsset
],
...
...
@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
for
k
in
inputs
.
keys
()
# noqa
if
k
not
in
(
"seq_lens"
,
"sft_format"
)
}
inputs
=
BatchEncoding
(
data
=
inputs
,
tensor_type
=
"pt"
)
return
inputs
return
BatchFeature
(
data
=
inputs
,
tensor_type
=
"pt"
)
hf_model
.
processor
=
processor
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
...
...
@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
def
minicpm
o
_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
minicpm
v_25
_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
**
kwargs
):
def
_generate
(
self
,
*
args
,
input_ids
=
None
,
pixel_values
=
None
,
image_sizes
=
None
,
image_bound
=
None
,
tgt_sizes
=
None
,
**
kwargs
,
):
model_inputs
=
{
"input_ids"
:
input_ids
,
"pixel_values"
:
pixel_values
,
"image_sizes"
:
image_sizes
,
"image_bound"
:
image_bound
,
"tgt_sizes"
:
tgt_sizes
,
}
for
k
in
list
(
model_inputs
.
keys
()):
if
model_inputs
[
k
]
is
None
:
model_inputs
.
pop
(
k
)
return
orig_generate
(
model_inputs
,
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
def
minicpmo_26_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
image_sizes
=
None
,
**
kwargs
):
return
orig_generate
(
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
def
minicpmv_26_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
orig_generate
=
hf_model
.
model
.
generate
def
_generate
(
self
,
*
args
,
image_sizes
=
None
,
**
kwargs
):
return
orig_generate
(
*
args
,
decode_text
=
False
,
**
kwargs
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
...
...
@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def
_generate
(
self
,
max_new_tokens
=
None
,
do_sample
=
None
,
**
kwargs
):
batch
=
{
k
:
kwargs
.
pop
(
k
)
k
:
kwargs
.
pop
(
k
)
.
unsqueeze
(
0
)
for
k
in
(
"input_ids"
,
"images"
,
"image_input_idx"
,
"image_masks"
)
if
k
in
kwargs
}
batch
=
BatchFeature
(
batch
).
to
(
dtype
=
self
.
dtype
)
return
self
.
generate_from_batch
(
batch
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
ca796e19
...
...
@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
import
torch
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
transformers
import
AutoModelForCausalLM
,
BatchEncoding
from
transformers
import
AutoModelForCausalLM
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
identity
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
from
....utils
import
check_logprobs_close
...
...
@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
# Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos
:
bool
=
False
auto_cls
:
type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
# Callable to pass to the HF runner to run on inputs; for now, we also pass
# the data type to input post processing, because almost all of the uses of
# postprocess_inputs are to fix the data types of BatchEncoding values.
postprocess_inputs
:
Callable
[[
BatchEncoding
,
str
],
BatchEncoding
]
=
identity
patch_hf_runner
:
Optional
[
Callable
[[
HfRunner
],
HfRunner
]]
=
None
# Post processors that if defined, will run oun the outputs of the
...
...
@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
# is all combinations of .models + all fields below
max_tokens
:
Union
[
int
,
tuple
[
int
]]
=
128
num_logprobs
:
Union
[
int
,
tuple
[
int
]]
=
5
dtype
:
Union
[
str
,
Iterable
[
str
]]
=
"
half
"
dtype
:
Union
[
str
,
Union
[
list
[
str
],
tuple
[
str
,
...]
]]
=
"
auto
"
distributed_executor_backend
:
Optional
[
Union
[
str
,
Iterable
[
str
]]]
=
None
# Only expanded in video tests
num_video_frames
:
Union
[
int
,
tuple
[
int
]]
=
16
...
...
@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
"vllm_output_post_proc"
:
self
.
vllm_output_post_proc
,
"auto_cls"
:
self
.
auto_cls
,
"use_tokenizer_eos"
:
self
.
use_tokenizer_eos
,
"postprocess_inputs"
:
self
.
postprocess_inputs
,
"comparator"
:
self
.
comparator
,
"get_stop_token_ids"
:
self
.
get_stop_token_ids
,
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
...
...
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
View file @
ca796e19
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
typing
import
Callable
import
pytest
import
torch
import
torch.nn.functional
as
F
from
PIL
import
Image
from
transformers
import
BatchEncoding
,
Qwen2VLForConditionalGeneration
from
transformers
import
Qwen2VLForConditionalGeneration
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
...
...
@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
return
prompt
def
postprocess_inputs
(
hf_model
:
HfRunner
,
inputs
:
BatchEncoding
,
**
kwargs
):
return
hf_model
.
model
.
prepare_inputs_for_generation
(
**
inputs
,
**
kwargs
)
def
_run_test
(
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
...
...
@@ -118,14 +114,8 @@ def _run_test(
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
Qwen2VLForConditionalGeneration
)
as
hf_model
:
hf_model
.
postprocess_inputs
=
partial
(
postprocess_inputs
,
hf_model
,
cache_position
=
torch
.
arange
(
0
,
1
,
# 1 for batch size
requires_grad
=
False
),
use_cache
=
False
)
prompts
=
[]
for
text
,
image
,
embed_text
in
zip
(
input_texts
,
input_images
,
embed_texts
):
# dse requires non-standard input processing
...
...
@@ -133,20 +123,34 @@ def _run_test(
messages
=
get_messages
(
image
,
text
,
embed_text
)
prompt
=
apply_chat_template_and_add_eos
(
messages
,
hf_model
.
processor
.
apply_chat_template
)
inputs
=
hf_model
.
get_inputs
(
prompts
=
[[
prompt
]],
images
=
[[
image
]],
)
with
torch
.
no_grad
():
prompts
.
append
(
prompt
)
all_inputs
=
hf_model
.
get_inputs
(
prompts
=
prompts
,
images
=
input_images
,
)
with
torch
.
no_grad
():
all_outputs
=
[]
for
inputs
in
all_inputs
:
inputs
=
hf_model
.
model
.
prepare_inputs_for_generation
(
**
inputs
,
cache_position
=
torch
.
arange
(
1
),
# 1 for batch size
use_cache
=
False
,
)
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
[
0
],
device
=
hf_model
.
model
.
device
.
type
),
**
hf_model
.
wrap_device
(
inputs
),
return_dict
=
True
,
output_hidden_states
=
True
,
)
pooled_output
=
torch
.
nn
.
functional
.
normalize
(
outputs
.
hidden_states
[
-
1
][
0
,
-
1
],
p
=
2
,
dim
=-
1
)
hf_outputs
.
append
(
pooled_output
.
tolist
())
pooled_output
=
F
.
normalize
(
outputs
.
hidden_states
[
-
1
][
0
,
-
1
],
p
=
2
,
dim
=-
1
)
all_outputs
.
append
(
pooled_output
.
tolist
())
hf_outputs
=
all_outputs
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
...
...
tests/models/embedding/vision_language/test_llava_next.py
View file @
ca796e19
...
...
@@ -2,7 +2,7 @@
import
pytest
import
torch.nn.functional
as
F
from
transformers
import
AutoModelFor
Vision2Seq
from
transformers
import
AutoModelFor
ImageTextToText
from
vllm.platforms
import
current_platform
...
...
@@ -70,7 +70,7 @@ def _run_test(
vllm_outputs
=
vllm_model
.
encode
(
input_texts
,
images
=
input_images
)
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelFor
Vision2Seq
)
as
hf_model
:
auto_cls
=
AutoModelFor
ImageTextToText
)
as
hf_model
:
# Patch the issue where generation_config.json is missing
hf_model
.
processor
.
patch_size
=
\
hf_model
.
model
.
config
.
vision_config
.
patch_size
...
...
@@ -86,8 +86,7 @@ def _run_test(
for
inputs
in
all_inputs
:
# Based on: https://huggingface.co/royokong/e5-v
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
,
device
=
hf_model
.
model
.
device
.
type
),
**
hf_model
.
wrap_device
(
inputs
),
return_dict
=
True
,
output_hidden_states
=
True
,
)
...
...
tests/models/embedding/vision_language/test_phi3v.py
View file @
ca796e19
...
...
@@ -53,8 +53,7 @@ def _run_test(
for
inputs
in
all_inputs
:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
,
device
=
hf_model
.
model
.
device
.
type
),
**
hf_model
.
wrap_device
(
inputs
),
return_dict
=
True
,
output_hidden_states
=
True
,
)
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
ca796e19
...
...
@@ -4,8 +4,7 @@ from typing import Optional, overload
import
pytest
import
torch
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
from
transformers
import
AutoConfig
,
AutoModelForImageTextToText
,
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
...
...
@@ -227,14 +226,10 @@ def _run_test(
for
prompts
,
images
in
inputs
]
def
process
(
hf_inputs
:
BatchEncoding
,
**
kwargs
):
return
hf_inputs
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
{
"device_map"
:
"auto"
},
postprocess_inputs
=
process
,
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
auto_cls
=
AutoModelForImageTextToText
)
as
hf_model
:
hf_outputs_per_image
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
...
...
tests/models/fixtures/mistral_small_3_chat.json
0 → 100644
View file @
ca796e19
This diff is collapsed.
Click to expand it.
tests/models/fixtures/pixtral_chat_engine.json
deleted
100644 → 0
View file @
e983c804
This diff is collapsed.
Click to expand it.
tests/models/utils.py
View file @
ca796e19
...
...
@@ -2,7 +2,7 @@
import
warnings
from
collections.abc
import
Sequence
from
typing
import
Optional
,
Union
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -254,9 +254,9 @@ def check_logprobs_close(
def
build_model_context
(
model_id
:
str
,
task
:
TaskOption
=
"auto"
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]
]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
]
=
None
,
dtype
:
Union
[
str
,
torch
.
dtype
]
=
"auto"
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]
]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]
]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
True
,
):
"""Creates an InputContext for a given model.
...
...
@@ -274,9 +274,6 @@ def build_model_context(
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
if
dtype
is
None
:
dtype
=
"half"
model_config
=
ModelConfig
(
model_id
,
task
=
task
,
...
...
tests/multimodal/test_processing.py
View file @
ca796e19
...
...
@@ -7,19 +7,25 @@ from unittest.mock import MagicMock
import
numpy
as
np
import
pytest
import
torch
from
transformers
import
ProcessorMixin
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldElem
,
MultiModalKwargs
,
MultiModalKwargsItem
,
MultiModalSharedField
)
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.multimodal.processing
import
(
PlaceholderFeaturesInfo
,
PromptIndexTargets
,
PromptInsertion
,
PromptReplacement
,
apply_text_matches
,
ProcessingCache
,
PromptIndexTargets
,
PromptInsertion
,
PromptReplacement
,
apply_text_matches
,
apply_token_matches
,
find_mm_placeholders
,
find_text_matches
,
find_token_matches
,
iter_token_matches
)
iter_token_matches
,
replace_token_matches
)
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
...
...
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
assert
all
(
match_len
==
len
(
match_ids
)
for
match_len
in
match_lens
)
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"token_ids"
,
"match_ids"
,
"new_ids"
,
"expected"
),
[
([],
[],
[
-
1
],
[]),
([],
[
32000
],
[
-
1
],
[]),
(
[
32000
,
32000
,
32000
],
[
32000
],
[
-
1
],
[
-
1
,
-
1
,
-
1
],
),
(
[
32000
,
32000
,
32000
],
[
32000
,
32000
],
[
-
1
],
[
-
1
,
32000
],
),
(
[
32000
,
32000
,
32000
],
[
32000
,
32000
,
32000
],
[
-
1
],
[
-
1
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
32000
],
[
-
1
],
[
9833
,
-
1
,
32000
,
32000
,
9833
,
-
1
,
32000
,
918
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
32000
,
32000
,
32000
],
[
-
1
],
[
9833
,
-
1
,
9833
,
28747
,
32000
,
32000
,
918
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
0
,
32000
],
[
-
1
],
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
),
],
)
# yapf: enable
def
test_replace_token_matches
(
token_ids
,
match_ids
,
new_ids
,
expected
):
result
=
replace_token_matches
(
token_ids
,
match_ids
,
new_ids
)
# Manually constructed results
assert
result
==
expected
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"prompt"
,
"target_by_key"
,
"expected_by_key"
),
...
...
@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
assert
result
==
expected
def
_dummy_elem
(
modality
:
str
,
key
:
str
,
size
:
int
):
return
MultiModalFieldElem
(
modality
=
modality
,
key
=
key
,
data
=
torch
.
empty
((
size
,
),
dtype
=
torch
.
int8
),
field
=
MultiModalSharedField
(
1
),
)
def
_dummy_item
(
modality
:
str
,
size_by_key
:
dict
[
str
,
int
]):
return
MultiModalKwargsItem
.
from_elems
([
_dummy_elem
(
modality
,
key
,
size
)
for
key
,
size
in
size_by_key
.
items
()
])
def
_dummy_kw
(
size_by_key_modality
:
dict
[
str
,
dict
[
str
,
int
]]):
return
MultiModalKwargs
.
from_items
([
_dummy_item
(
modality
,
size_by_key
)
for
modality
,
size_by_key
in
size_by_key_modality
.
items
()
])
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"item"
,
"expected_size"
),
[
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_kw
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
],
)
# yapf: enable
def
test_cache_item_size
(
item
,
expected_size
):
cache
=
ProcessingCache
.
get_lru_cache
(
2048
,
type
(
item
))
cache
[
""
]
=
item
assert
cache
.
currsize
==
expected_size
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
(
"limit"
,
"num_supported"
,
"is_valid"
),
...
...
@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
...
...
@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
...
...
@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
)
...
...
tests/neuron/1_core/test_prefix_prefill.py
View file @
ca796e19
...
...
@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
# Test edge cases
(
1
,
128
,
16
,
1024
,
4
,
2
,
16
,
False
),
# large decode batch
(
16
,
4
,
8
,
8192
,
4
8
,
1
,
128
,
True
),
# large prefill batch
(
16
,
4
,
8
,
1024
,
4
,
2
,
128
,
True
),
# large prefill batch
(
4
,
12
,
32
,
2048
,
16
,
1
,
32
,
True
),
# multi-head attention (MHA)
(
4
,
12
,
32
,
2048
,
16
,
16
,
32
,
True
),
# multi-query attention (MQA)
])
...
...
tests/quantization/test_bitsandbytes.py
View file @
ca796e19
...
...
@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test
=
[
(
"facebook/opt-125m"
,
"quantize opt model inflight"
),
(
"mistralai/Mistral-7B-Instruct-v0.3"
,
"quantize inflight model with both HF and Mistral format weights"
)
]
models_pre_qaunt_4bit_to_test
=
[
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
ca796e19
...
...
@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
# Serialize model before deserializing and binding LoRA adapters
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
with
vllm_runner
(
model_ref
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
vllm_model
.
apply_model
(
...
...
@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
@
pytest
.
mark
.
skipif
(
not
is_curl_installed
(),
reason
=
"cURL is not installed"
)
def
test_openai_apiserver_with_tensorizer
(
vllm_runner
,
tmp_path
):
## Serialize model
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
with
vllm_runner
(
model_ref
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
vllm_model
.
apply_model
(
...
...
tests/tpu/test_compilation.py
View file @
ca796e19
...
...
@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
# disable custom dispatcher, let Dynamo takes over
# all the control
llm
=
LLM
(
model
=
"google/gemma-2b"
,
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
512
,
max_num_seqs
=
64
,
enforce_eager
=
True
,
compilation_config
=
{
"level"
:
CompilationLevel
.
DYNAMO_AS_IS
})
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
assert
generated_text
.
startswith
(
answer
)
compiled_code
=
sorted
(
compiled_code
s
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
# we should only trigger Dynamo compilation three times:
# one for the profiling phase without kv cache
# one for the prefill phase with symbolic shapes
# one for the decode phase with symbolic shapes
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
# We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again.
# NOTE: it might still trigger XLA compilation.
# NOTE: It might still trigger XLA compilation.
# Check we have 4 compiled codes
assert
len
(
compiled_codes
)
==
4
# check we have three compiled code
# this is the assumption when we use the custom dispatcher
assert
len
(
compiled_code
)
==
3
kv_cache_prefix
=
"kv_cache"
attn_prefix
=
"ragged_paged_attention"
#
c
heck all the compilations are as expected
compiled_fn
=
sorted
(
#
C
heck all the compilations are as expected
compiled_fn
s
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
# the first compilation is the profiling phase,
# it should not have any kv cache
with
open
(
compiled_fn
[
0
])
as
f
:
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
# The first compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
0
])
as
f
:
content
=
f
.
read
()
assert
kv_cache_prefix
not
in
content
# The second compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
1
])
as
f
:
content
=
f
.
read
()
assert
"
kv_cache
s"
not
in
content
assert
kv_cache
_prefix
not
in
content
#
t
he
secon
d compilation is
the prefill phase,
#
it should have kv cache and the flash
_attention
op
with
open
(
compiled_fn
[
1
])
as
f
:
#
T
he
thir
d compilation is
shape 16, so it should have kv_caches and the
#
ragged_paged
_attention
with
open
(
compiled_fn
s
[
2
])
as
f
:
content
=
f
.
read
()
assert
"
kv_cache
s"
in
content
and
"torch.ops.xla.flash_attention"
in
content
assert
(
kv_cache
_prefix
in
content
and
attn_prefix
in
content
)
#
t
he
third
compilation is
the decode phase,
#
it should have kv cache and the
paged_attention
op
with
open
(
compiled_fn
[
2
])
as
f
:
#
T
he
forth
compilation is
shape 32, so it should have kv_caches and the
#
ragged_
paged_attention
with
open
(
compiled_fn
s
[
3
])
as
f
:
content
=
f
.
read
()
assert
"
kv_cache
s"
in
content
and
"torch.ops.xla.paged_attention"
in
content
assert
(
kv_cache
_prefix
in
content
and
attn_prefix
in
content
)
tests/tpu/test_custom_dispatcher.py
View file @
ca796e19
...
...
@@ -14,12 +14,17 @@ from ..utils import compare_two_settings
def
test_custom_dispatcher
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_RPC_TIMEOUT"
,
"30000"
)
compare_two_settings
(
"google/gemma-2b"
,
arg1
=
[
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_ONCE
}
"
,
],
arg2
=
[
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_AS_IS
}
"
],
env1
=
{},
env2
=
{})
compare_two_settings
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
arg1
=
[
"--max-model-len=256"
,
"--max-num-seqs=32"
,
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_ONCE
}
"
,
],
arg2
=
[
"--max-model-len=256"
,
"--max-num-seqs=32"
,
"--enforce-eager"
,
f
"-O
{
CompilationLevel
.
DYNAMO_AS_IS
}
"
],
env1
=
{},
env2
=
{})
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment