Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
04629132
Commit
04629132
authored
Jun 12, 2025
by
zhuwenwen
Browse files
[tests] fix tests
parent
07c69390
Changes
52
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
88 additions
and
1008 deletions
+88
-1008
tests/models/decoder_only/language/untest_fp8.py
tests/models/decoder_only/language/untest_fp8.py
+0
-0
tests/models/decoder_only/language/untest_gptq_marlin.py
tests/models/decoder_only/language/untest_gptq_marlin.py
+0
-0
tests/models/decoder_only/language/untest_gptq_marlin_24.py
tests/models/decoder_only/language/untest_gptq_marlin_24.py
+0
-0
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
...ly/vision_language/mm_processor_kwargs/test_llava_next.py
+0
-72
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
...er_only/vision_language/mm_processor_kwargs/test_phi3v.py
+0
-100
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
...der_only/vision_language/mm_processor_kwargs/test_qwen.py
+0
-146
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
...only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+0
-107
tests/models/decoder_only/vision_language/test_glm4.py
tests/models/decoder_only/vision_language/test_glm4.py
+0
-135
tests/models/decoder_only/vision_language/test_h2ovl.py
tests/models/decoder_only/vision_language/test_h2ovl.py
+0
-131
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+0
-237
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+2
-2
tests/models/embedding/language/test_scoring.py
tests/models/embedding/language/test_scoring.py
+3
-3
tests/models/embedding/language/test_snowflake_arctic_embed.py
.../models/embedding/language/test_snowflake_arctic_embed.py
+9
-9
tests/models/encoder_decoder/audio_language/test_whisper.py
tests/models/encoder_decoder/audio_language/test_whisper.py
+10
-9
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+4
-3
tests/models/encoder_decoder/vision_language/test_broadcast.py
.../models/encoder_decoder/vision_language/test_broadcast.py
+2
-2
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+46
-44
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+2
-2
tests/models/multimodal/processing/test_llama4.py
tests/models/multimodal/processing/test_llama4.py
+4
-2
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+6
-4
No files found.
tests/models/decoder_only/language/test_fp8.py
→
tests/models/decoder_only/language/
un
test_fp8.py
View file @
04629132
File moved
tests/models/decoder_only/language/test_gptq_marlin.py
→
tests/models/decoder_only/language/
un
test_gptq_marlin.py
View file @
04629132
File moved
tests/models/decoder_only/language/test_gptq_marlin_24.py
→
tests/models/decoder_only/language/
un
test_gptq_marlin_24.py
View file @
04629132
File moved
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
deleted
100644 → 0
View file @
07c69390
import
os
import
pytest
from
vllm.inputs
import
InputContext
from
....utils
import
build_model_context
from
.....utils
import
models_path_prefix
@
pytest
.
fixture
()
def
get_max_llava_next_image_tokens
():
from
vllm.model_executor.models.llava_next
import
(
get_max_llava_next_image_tokens
)
return
get_max_llava_next_image_tokens
@
pytest
.
fixture
()
def
dummy_data_for_llava_next
():
from
vllm.model_executor.models.llava_next
import
dummy_data_for_llava_next
return
dummy_data_for_llava_next
@
pytest
.
mark
.
parametrize
(
"gridpoints,expected_max_tokens"
,
[
([[
336
,
336
]],
1176
),
([[
336
,
672
],
[
672
,
336
],
[
672
,
672
],
[
1008
,
336
],
[
336
,
1008
]],
2928
),
])
def
test_get_max_llava_next_image_tokens
(
gridpoints
,
expected_max_tokens
,
get_max_llava_next_image_tokens
):
ctx
=
build_model_context
(
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
))
# Update the config image_grid_pinpoints
# and calculate the resulting max tokens
ctx
.
model_config
.
hf_config
.
image_grid_pinpoints
=
gridpoints
actual_max_tokens
=
get_max_llava_next_image_tokens
(
InputContext
(
ctx
.
model_config
))
assert
expected_max_tokens
==
actual_max_tokens
@
pytest
.
mark
.
parametrize
(
"gridpoints,expected_size"
,
[
# One point; it has to be the largest
([[
336
,
336
]],
(
336
,
336
)),
# Default for most llava next models; the 2x2 tile is the largest
([[
336
,
672
],
[
672
,
336
],
[
672
,
672
],
[
1008
,
336
],
[
336
,
1008
]],
(
672
,
672
)),
# If two rectangular gridpoints are the same, the more vertical
# one has the higher feature count due to newline features
([[
336
,
672
],
[
672
,
336
]],
(
672
,
336
))
])
def
test_dummy_data_for_llava_next_feature_size
(
dummy_data_for_llava_next
,
gridpoints
,
expected_size
):
ctx
=
build_model_context
(
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
))
# Update the config image_grid_pinpoints
ctx
.
model_config
.
hf_config
.
image_grid_pinpoints
=
gridpoints
seq_len
=
5000
# bigger than the max feature size for any image
dummy_data
=
dummy_data_for_llava_next
(
ctx
,
seq_len
=
seq_len
,
mm_counts
=
{
"image"
:
1
},
)
seq_data
=
dummy_data
.
seq_data
mm_data
=
dummy_data
.
multi_modal_data
# The dummy data dims should match the gridpoint with the biggest feat size
assert
mm_data
[
"image"
].
height
==
expected_size
[
0
]
assert
mm_data
[
"image"
].
width
==
expected_size
[
1
]
assert
len
(
seq_data
.
get_token_ids
())
>=
seq_len
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
deleted
100644 → 0
View file @
07c69390
"""Tests for phi3v's multimodal preprocessing kwargs."""
from
typing
import
Optional
import
os
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.inputs
import
InputContext
,
InputProcessingContext
from
vllm.model_executor.models.phi3v
import
_IMAGE_TOKEN_ID
from
.....conftest
import
_ImageAssets
from
....utils
import
build_model_context
from
.....utils
import
models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)]
# Wrap lazy imports to avoid initializing CUDA during test collection
@
pytest
.
fixture
()
def
processor_for_phi3v
():
from
vllm.model_executor.models.phi3v
import
Phi3VMultiModalProcessor
return
Phi3VMultiModalProcessor
@
pytest
.
fixture
()
def
get_max_phi3v_image_tokens
():
from
vllm.model_executor.models.phi3v
import
get_max_phi3v_image_tokens
return
get_max_phi3v_image_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"num_crops,expected_max_tokens"
,
[
(
4
,
781
),
(
16
,
2653
),
])
def
test_max_tokens_override
(
get_max_phi3v_image_tokens
,
model
:
str
,
num_crops
:
int
,
expected_max_tokens
:
int
):
"""Ensure get_max_phi3v_image_tokens handles num_crops properly."""
# NOTE: mm_processor_kwargs on the context in this test is unused, since
# this is testing the mapper directly. In practice, the processor kwargs
# are wrapped in a closure when calling the max tokens func. We explicitly
# do NOT use the mm_processor_kwargs in the model context here to ensure
# that the max image tokens implementation is referencing a mix of the
# kwargs to the function and the original mm_processor_kwargs in case
# values are somehow updated and end up in a bad state.
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
actual_max_tokens
=
get_max_phi3v_image_tokens
(
InputContext
(
ctx
.
model_config
),
num_crops
=
num_crops
,
)
assert
expected_max_tokens
==
actual_max_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"num_crops,expected_toks_per_img"
,
[
(
4
,
757
),
(
16
,
1921
),
# the default num_crops of phi-3.5-vision is 4
(
None
,
757
),
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
processor_for_phi3v
,
image_assets
:
_ImageAssets
,
model
:
str
,
num_crops
:
Optional
[
int
],
expected_toks_per_img
:
int
,
num_imgs
:
int
):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
ctx
=
InputProcessingContext
(
ctx
.
model_config
,
tokenizer
)
# Build the image str / prompt based on the number of images we pass
img_str
=
""
.
join
([
f
"<|image_
{
idx
}
|>
\n
"
for
idx
in
range
(
1
,
num_imgs
+
1
)])
prompt
=
f
"<|user|>
\n
{
img_str
}
<|end|>
\n
<|assistant|>
\n
"
images
=
[
image_assets
[
0
].
pil_image
]
*
num_imgs
mm_data
=
{
"image"
:
images
}
mm_processor_kwargs
=
{}
if
num_crops
is
not
None
:
mm_processor_kwargs
=
{
"num_crops"
:
num_crops
}
processor
=
processor_for_phi3v
(
ctx
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
_IMAGE_TOKEN_ID
)
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
deleted
100644 → 0
View file @
07c69390
"""Tests for Qwen's multimodal preprocessing kwargs."""
from
typing
import
Dict
,
List
,
Union
import
os
import
pytest
import
torch
from
PIL.Image
import
Image
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
.....conftest
import
IMAGE_ASSETS
from
....utils
import
build_model_context
from
.....utils
import
models_path_prefix
### Multimodal preprocessing tests
SAMPLE_IMAGE
=
IMAGE_ASSETS
[
0
].
pil_image
# These values are specific to Qwen-VL/Chat; we can get these from the model
# config also, but they are hardcoded here to keep the parameterize/fixtures
# easy to read.
IMG_START_ID
=
151857
IMG_END_ID
=
151858
IMG_PAD_ID
=
151859
TOKS_PER_IMG
=
256
VIS_ENC_DIM
=
4096
IMG_SIZE
=
448
@
pytest
.
fixture
()
def
input_mapper_for_qwen
():
# Lazy import to avoid initializing CUDA during test collection
from
vllm.model_executor.models.qwen
import
input_mapper_for_qwen
return
input_mapper_for_qwen
@
pytest
.
fixture
()
def
input_processor_for_qwen
():
# Lazy import to avoid initializing CUDA during test collection
from
vllm.model_executor.models.qwen
import
input_processor_for_qwen
return
input_processor_for_qwen
@
pytest
.
fixture
()
def
qwen_vl_context
()
->
InputContext
:
"""Get an InputContext for Qwen-VL."""
return
build_model_context
(
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL"
),
trust_remote_code
=
True
)
# Happy path tests for single/multi-image scenarios for the multimodal
# input processor and mapper, respectively
@
pytest
.
mark
.
parametrize
(
"num_images"
,
[
1
,
2
])
def
test_input_processor_valid_mm_data
(
input_processor_for_qwen
,
qwen_vl_context
:
InputContext
,
num_images
:
int
):
"""Happy cases for image inputs to Qwen's multimodal input processor."""
prompt
=
""
.
join
(
[
f
"Picture
{
num
}
: <img></img>
\n
"
for
num
in
range
(
1
,
num_images
+
1
)])
inputs
=
token_inputs
(
prompt
=
prompt
,
# When processing multimodal data for a multimodal model, the qwen
# input processor will overwrite the provided prompt_token_ids with
# the image prompts
prompt_token_ids
=
[],
multi_modal_data
=
{
"image"
:
torch
.
rand
(
num_images
,
TOKS_PER_IMG
,
4096
)},
)
proc_inputs
=
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
assert
isinstance
(
proc_inputs
,
dict
)
# Each image should have one start / stop and a fixed context of 256
proc_tokens
=
proc_inputs
[
"prompt_token_ids"
]
assert
proc_tokens
.
count
(
IMG_START_ID
)
==
num_images
assert
proc_tokens
.
count
(
IMG_END_ID
)
==
num_images
assert
proc_tokens
.
count
(
IMG_PAD_ID
)
==
num_images
*
TOKS_PER_IMG
@
pytest
.
mark
.
parametrize
(
"img_data,expected_shape"
,
[
# single / multi-image
(
SAMPLE_IMAGE
,
(
1
,
3
,
IMG_SIZE
,
IMG_SIZE
)),
(
2
*
[
SAMPLE_IMAGE
],
(
2
,
3
,
IMG_SIZE
,
IMG_SIZE
)),
# single / multi-image embeddings
(
torch
.
rand
(
(
TOKS_PER_IMG
,
VIS_ENC_DIM
)),
(
1
,
TOKS_PER_IMG
,
VIS_ENC_DIM
)),
(
torch
.
rand
(
(
1
,
TOKS_PER_IMG
,
VIS_ENC_DIM
)),
(
1
,
TOKS_PER_IMG
,
VIS_ENC_DIM
)),
(
torch
.
rand
(
(
2
,
TOKS_PER_IMG
,
VIS_ENC_DIM
)),
(
2
,
TOKS_PER_IMG
,
VIS_ENC_DIM
)),
])
def
test_input_mapper_valid_mm_data
(
input_mapper_for_qwen
,
qwen_vl_context
:
InputContext
,
img_data
:
Union
[
torch
.
Tensor
,
List
[
Image
],
Image
],
expected_shape
:
List
[
int
]):
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
mapped_img_data
=
input_mapper_for_qwen
(
qwen_vl_context
,
img_data
)
# Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively.
assert
isinstance
(
mapped_img_data
,
MultiModalKwargs
)
assert
"pixel_values"
in
mapped_img_data
assert
mapped_img_data
[
"pixel_values"
].
shape
==
expected_shape
# Sad path tests for the multimodal input processor and mapper, respectively
@
pytest
.
mark
.
parametrize
(
"mm_data"
,
[
{
"image"
:
torch
.
rand
(
5
)
},
{
"image"
:
torch
.
rand
((
5
,
5
,
5
,
5
,
5
))
},
])
def
test_input_processor_invalid_mm_data
(
input_processor_for_qwen
,
qwen_vl_context
:
InputContext
,
mm_data
:
Dict
[
str
,
torch
.
Tensor
]):
"""Test sad cases validated in Qwen's multimodal input processor."""
tokenizer
=
cached_get_tokenizer
(
qwen_vl_context
.
model_config
.
tokenizer
,
trust_remote_code
=
True
)
prompt
=
"Picture 1: <img></img>
\n
"
prompt_token_ids
=
tokenizer
.
encode
(
prompt
)
inputs
=
token_inputs
(
prompt
=
prompt
,
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
mm_data
)
# Should fail since we have too many or too few dimensions for embeddings
with
pytest
.
raises
(
ValueError
):
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
@
pytest
.
mark
.
parametrize
(
"img_data"
,
[
# Wrong context length
torch
.
rand
((
1
,
TOKS_PER_IMG
+
10
,
VIS_ENC_DIM
)),
# Wrong visual encoder output size
torch
.
rand
((
1
,
TOKS_PER_IMG
,
VIS_ENC_DIM
+
10
)),
])
def
test_input_mapper_invalid_mm_data
(
input_mapper_for_qwen
,
qwen_vl_context
:
InputContext
,
img_data
:
Union
[
torch
.
Tensor
,
List
[
Image
],
Image
],
):
"""Sad cases validated in Qwen VL's multimodal input mapper."""
with
pytest
.
raises
(
ValueError
):
input_mapper_for_qwen
(
qwen_vl_context
,
img_data
)
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
deleted
100644 → 0
View file @
07c69390
from
typing
import
Any
,
Dict
,
Tuple
import
os
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.inputs
import
InputContext
,
InputProcessingContext
from
.....conftest
import
_ImageAssets
from
....utils
import
build_model_context
from
.....utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
MIN_PIXELS
=
"min_pixels"
MAX_PIXELS
=
"max_pixels"
# Fixtures lazy import to avoid initializing CUDA during test collection
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers.
@
pytest
.
fixture
()
def
processor_for_qwen2_vl
():
from
vllm.model_executor.models.qwen2_vl
import
Qwen2VLMultiModalProcessor
return
Qwen2VLMultiModalProcessor
@
pytest
.
fixture
()
def
get_max_qwen2_vl_image_tokens
():
from
vllm.model_executor.models.qwen2_vl
import
(
get_max_qwen2_vl_image_tokens
)
return
get_max_qwen2_vl_image_tokens
@
pytest
.
mark
.
parametrize
(
"mm_processor_kwargs,expected_max_tokens"
,
[
({},
1225
),
({
MIN_PIXELS
:
64
**
2
,
MAX_PIXELS
:
512
**
2
},
324
),
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
MODEL
])
def
test_qwen2_vl_max_image_tokens
(
get_max_qwen2_vl_image_tokens
,
model
:
str
,
mm_processor_kwargs
:
Dict
[
str
,
Any
],
expected_max_tokens
:
int
,
):
"""Ensure that the max token calc handles min/max pixels properly."""
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
mm_processor_kwargs
=
None
,
)
actual_max_tokens
=
get_max_qwen2_vl_image_tokens
(
InputContext
(
ctx
.
model_config
),
**
mm_processor_kwargs
)
assert
actual_max_tokens
==
expected_max_tokens
@
pytest
.
mark
.
parametrize
(
"mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape"
,
[
({},
1426
,
(
5704
,
1176
)),
({
MIN_PIXELS
:
64
**
2
,
MAX_PIXELS
:
512
**
2
},
330
,
(
1320
,
1176
)),
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
MODEL
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
processor_for_qwen2_vl
,
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
Dict
[
str
,
Any
],
expected_toks_per_img
:
int
,
expected_pixels_shape
:
Tuple
[
int
,
int
],
num_imgs
:
int
,
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
mm_processor_kwargs
=
None
,
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
ctx
=
InputProcessingContext
(
ctx
.
model_config
,
tokenizer
)
# Build the image str / prompt based on the number of images we pass
prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
*
num_imgs
images
=
[
image_assets
[
0
].
pil_image
]
*
num_imgs
mm_data
=
{
"image"
:
images
}
processor
=
processor_for_qwen2_vl
(
ctx
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
hf_processor
=
processor
.
_get_hf_processor
(
**
mm_processor_kwargs
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
image_token
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values"
].
shape
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_pixels_shape
[
0
]
*
num_imgs
assert
pixel_shape
[
1
]
==
expected_pixels_shape
[
1
]
tests/models/decoder_only/vision_language/test_glm4.py
deleted
100644 → 0
View file @
07c69390
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
pytest
import
os
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
large_gpu_test
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"What's the content of the image?"
,
"cherry_blossom"
:
"What is the season?"
,
})
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
)]
target_dtype
=
"bfloat16"
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
inputs
:
List
[
Tuple
[
List
[
str
],
PromptImageInput
]],
model
:
str
,
*
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
mm_limit
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
stop_token_ids
=
[
151329
,
151336
,
151338
]
vllm_outputs_per_image
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
stop_token_ids
=
stop_token_ids
)
for
prompts
,
images
in
inputs
]
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_processor
=
hf_model
.
processor
patch_padding_side
(
hf_processor
)
def
processor
(
*
args
,
text
=
""
,
images
=
None
,
**
kwargs
):
if
images
is
None
:
return
hf_processor
(
*
args
,
**
kwargs
)
return
hf_processor
.
apply_chat_template
(
[{
"role"
:
"user"
,
"image"
:
images
,
"content"
:
text
}],
add_generation_prompt
=
True
,
tokenize
=
True
,
return_dict
=
True
,
**
kwargs
,
)
hf_model
.
processor
=
processor
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
transformer
.
output_layer
hf_outputs_per_image
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
)
for
prompts
,
images
in
inputs
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
vllm_outputs_per_image
):
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
run_test
(
hf_runner
,
vllm_runner
,
inputs_per_image
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
\ No newline at end of file
tests/models/decoder_only/vision_language/test_h2ovl.py
deleted
100644 → 0
View file @
07c69390
from
typing
import
Optional
,
Tuple
import
os
import
pytest
import
torch
from
PIL.Image
import
Image
from
transformers
import
AutoConfig
# Import the functions to test
from
vllm.model_executor.models.h2ovl
import
(
calculate_num_blocks
,
image_to_pixel_values_wrapper
)
from
vllm.multimodal.image
import
rescale_image_size
from
....utils
import
models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-800m"
),
# Replace with your actual model names
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-2b"
),
]
def
run_preprocessing_test
(
image
:
Image
,
config
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
)
->
Tuple
[
torch
.
Tensor
,
int
]:
"""Test the image preprocessing and calculate expected blocks."""
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
config
.
max_dynamic_patch
width
,
height
=
image
.
size
use_MSAC
=
config
.
use_msac
# Create the mapper function with the provided configuration
mapper
=
image_to_pixel_values_wrapper
(
config
,
max_dynamic_patch
,
use_MSAC
)
pixel_values
=
mapper
(
image
)
# Calculate the expected number of blocks
if
use_MSAC
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_num_blocks
(
width
,
height
,
config
.
min_dynamic_patch
,
max_dynamic_patch
,
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
prior_aspect_ratio
=
None
,
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_num_blocks
(
width
,
height
,
config
.
min_dynamic_patch
,
max_dynamic_patch
,
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
prior_aspect_ratio
=
aspect_ratio
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus overlapping
total_blocks
=
blocks1
+
blocks2
-
1
expected_blocks
=
total_blocks
else
:
blocks
,
_
,
_
,
_
=
calculate_num_blocks
(
width
,
height
,
config
.
min_dynamic_patch
,
max_dynamic_patch
,
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
prior_aspect_ratio
=
None
,
)
expected_blocks
=
blocks
if
config
.
use_thumbnail
and
expected_blocks
>
1
:
expected_blocks
+=
1
return
pixel_values
,
expected_blocks
@
pytest
.
mark
.
parametrize
(
"model_name"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
None
,
2
,
4
,
8
])
def
test_image_preprocessing
(
image_assets
,
model_name
,
size_factors
,
max_dynamic_patch
):
"""Test image preprocessing pipeline with different configurations."""
# Load the configuration from the model
config
=
AutoConfig
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
for
asset
in
image_assets
:
image
=
asset
.
pil_image
for
factor
in
size_factors
:
scaled_image
=
rescale_image_size
(
image
,
factor
)
# Test preprocessing and get expected number of blocks
pixel_values
,
expected_blocks
=
run_preprocessing_test
(
scaled_image
,
config
,
max_dynamic_patch
)
# Verify output shapes and properties
actual_blocks
=
pixel_values
.
shape
[
0
]
assert
actual_blocks
==
expected_blocks
,
(
f
"Expected
{
expected_blocks
}
blocks, got
{
actual_blocks
}
"
)
# Check image dimensions
expected_size
=
(
3
,
# Number of channels (C, H, W)
config
.
vision_config
.
image_size
,
config
.
vision_config
.
image_size
,
)
for
img
in
pixel_values
:
assert
img
.
shape
==
expected_size
,
(
f
"Expected image size
{
expected_size
}
, got
{
img
.
shape
}
"
)
tests/models/decoder_only/vision_language/test_phi3v.py
deleted
100644 → 0
View file @
07c69390
# SPDX-License-Identifier: Apache-2.0
import
os
import
re
from
typing
import
Optional
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|user|>
\n
<|image_1|>
\n
What's the content of the image?<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
"cherry_blossom"
:
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
})
HF_MULTIIMAGE_IMAGE_PROMPT
=
"<|user|>
\n
<|image_1|>
\n
<|image_2|>
\n
Describe these images.<|end|>
\n
<|assistant|>
\n
"
# noqa: E501
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)]
def
vllm_to_hf_output
(
vllm_output
:
tuple
[
list
[
int
],
str
,
Optional
[
SampleLogprobs
]],
model
:
str
):
"""Sanitize vllm output to be comparable with hf output."""
_
,
output_str
,
out_logprobs
=
vllm_output
output_str_without_image
=
re
.
sub
(
r
"(<\|image_\d+\|>)+"
,
""
,
output_str
)
assert
output_str_without_image
[
0
]
==
" "
output_str_without_image
=
output_str_without_image
[
1
:]
hf_output_str
=
output_str_without_image
+
"<|end|><|endoftext|>"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
hf_output_ids
=
tokenizer
.
encode
(
output_str_without_image
)
assert
hf_output_ids
[
0
]
==
1
hf_output_ids
=
hf_output_ids
[
1
:]
return
hf_output_ids
,
hf_output_str
,
out_logprobs
target_dtype
=
"half"
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if
current_platform
.
is_rocm
():
os
.
environ
[
"VLLM_USE_TRITON_FLASH_ATTN"
]
=
"0"
def
run_test
(
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
inputs
:
list
[
tuple
[
list
[
str
],
PromptImageInput
]],
model
:
str
,
*
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
mm_limit
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# HACK - this is an attempted workaround for the following bug
# https://github.com/huggingface/transformers/issues/34307
from
transformers
import
AutoImageProcessor
# noqa: F401
from
transformers
import
AutoProcessor
# noqa: F401
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
task
=
"generate"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs_per_case
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
)
for
prompts
,
images
in
inputs
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs
=
{
"_attn_implementation"
:
"eager"
}
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
hf_model_kwargs
)
as
hf_model
:
eos_token_id
=
hf_model
.
processor
.
tokenizer
.
eos_token_id
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
eos_token_id
=
eos_token_id
)
for
prompts
,
images
in
inputs
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
vllm_outputs_per_case
):
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
[
vllm_to_hf_output
(
vllm_output
,
model
)
for
vllm_output
in
vllm_outputs
],
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
run_test
(
hf_runner
,
vllm_runner
,
inputs_per_image
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
                         dtype) -> None:
    """Regression test for #7840.

    Builds one test case per (image, prompt) pair, each holding exactly one
    prompt and one unscaled image, and runs them through ``run_test`` with
    ``max_tokens=128``, ``num_logprobs=10`` and ``mm_limit=1``.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    regression_inputs = []
    for pil_image, text_prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        regression_inputs.append(([text_prompt], [pil_image]))

    run_test(
        hf_runner,
        vllm_runner,
        regression_inputs,
        model,
        dtype=dtype,
        max_tokens=128,
        num_logprobs=10,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    """Run the multi-image comparison as a single test case.

    The case repeats ``HF_MULTIIMAGE_IMAGE_PROMPT`` once per entry of
    ``size_factors``; each prompt is accompanied by the full list of image
    assets rescaled by the corresponding factor. ``run_test`` is invoked
    with ``mm_limit=2`` and ``tensor_parallel_size=1``.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    repeated_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    # One inner list (all images at the same scale) per size factor.
    image_groups = []
    for scale in size_factors:
        image_groups.append(
            [rescale_image_size(pil_image, scale) for pil_image in pil_images])
    inputs_per_case = [(repeated_prompts, image_groups)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
tests/models/embedding/language/test_embedding.py
View file @
04629132
...
@@ -48,7 +48,7 @@ def test_models(
...
@@ -48,7 +48,7 @@ def test_models(
monkeypatch
,
monkeypatch
,
)
->
None
:
)
->
None
:
if
model
==
"BAAI/bge-multilingual-gemma2"
and
current_platform
.
is_rocm
():
if
model
==
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
)
and
current_platform
.
is_rocm
():
# ROCm Triton FA does not currently support sliding window attention
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
# switch to use ROCm CK FA backend
monkeypatch
.
setenv
(
"VLLM_USE_TRITON_FLASH_ATTN"
,
"False"
)
monkeypatch
.
setenv
(
"VLLM_USE_TRITON_FLASH_ATTN"
,
"False"
)
...
...
tests/models/embedding/language/test_scoring.py
View file @
04629132
...
@@ -12,12 +12,12 @@ import torch.nn.functional as F
...
@@ -12,12 +12,12 @@ import torch.nn.functional as F
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"cross-encoder/ms-marco-MiniLM-L-6-v2"
),
# Bert
#
os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), # Bert
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-reranker-v2-m3"
),
# Roberta
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-reranker-v2-m3"
),
# Roberta
]
]
EMBEDDING_MODELS
=
[
EMBEDDING_MODELS
=
[
"sentence-transformers/all-MiniLM-L12-v2"
,
# os.path.join(models_path_prefix,
"sentence-transformers/all-MiniLM-L12-v2"
)
,
]
]
TEXTS_1
=
[
TEXTS_1
=
[
...
...
tests/models/embedding/language/test_snowflake_arctic_embed.py
View file @
04629132
...
@@ -15,10 +15,10 @@ EMBEDDING_PROMPTS = [
...
@@ -15,10 +15,10 @@ EMBEDDING_PROMPTS = [
]
]
MODELS
=
[
MODELS
=
[
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-xs"
,
#
EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka
=
False
,
#
is_matryoshka=False,
architecture
=
"BertModel"
,
#
architecture="BertModel",
enable_test
=
True
),
#
enable_test=True),
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-s"
,
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-s"
,
is_matryoshka
=
False
,
is_matryoshka
=
False
,
architecture
=
"BertModel"
,
architecture
=
"BertModel"
,
...
@@ -43,10 +43,10 @@ MODELS = [
...
@@ -43,10 +43,10 @@ MODELS = [
is_matryoshka
=
True
,
is_matryoshka
=
True
,
architecture
=
"XLMRobertaModel"
,
architecture
=
"XLMRobertaModel"
,
enable_test
=
True
),
enable_test
=
True
),
EmbedModelInfo
(
"Snowflake/snowflake-arctic-embed-m-v2.0"
,
#
EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka
=
True
,
#
is_matryoshka=True,
architecture
=
"GteModel"
,
#
architecture="GteModel",
enable_test
=
True
),
#
enable_test=True),
]
]
...
...
tests/models/encoder_decoder/audio_language/test_whisper.py
View file @
04629132
...
@@ -5,12 +5,13 @@ Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
...
@@ -5,12 +5,13 @@ Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
"""
from
typing
import
Optional
from
typing
import
Optional
import
os
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.audio
import
AudioAsset
from
....utils
import
create_new_process_for_each_test
,
multi_gpu_test
from
....utils
import
create_new_process_for_each_test
,
multi_gpu_test
,
models_path_prefix
PROMPTS
=
[
PROMPTS
=
[
{
{
...
@@ -33,7 +34,7 @@ PROMPTS = [
...
@@ -33,7 +34,7 @@ PROMPTS = [
]
]
EXPECTED
=
{
EXPECTED
=
{
"openai/whisper-tiny"
:
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-tiny"
)
:
[
" He has birth words I spoke in the original corner of that. And a"
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" sweet, with white and snow. And everyone had it very went the last"
...
@@ -45,7 +46,7 @@ EXPECTED = {
...
@@ -45,7 +46,7 @@ EXPECTED = {
" American League Championship. I don't believe it. It just continues"
" American League Championship. I don't believe it. It just continues"
" by all five."
" by all five."
],
],
"openai/whisper-small"
:
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-small"
)
:
[
" The first words I spoke in the original pornograph. A little piece"
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go."
,
" slow, and everywhere that Mary went the lamb was sure to go."
,
...
@@ -55,7 +56,7 @@ EXPECTED = {
...
@@ -55,7 +56,7 @@ EXPECTED = {
" play for the American League Championship. I don't believe it. It"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my."
" just continues. My, oh my."
],
],
"openai/whisper-medium"
:
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-medium"
)
:
[
" The first words I spoke in the original phonograph, a little piece"
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go."
,
" slow, and everywhere that Mary went the lamb was sure to go."
,
...
@@ -66,7 +67,7 @@ EXPECTED = {
...
@@ -66,7 +67,7 @@ EXPECTED = {
" League Championship. I don't believe it. It just continues. My, oh"
" League Championship. I don't believe it. It just continues. My, oh"
" my."
" my."
],
],
"openai/whisper-large-v3"
:
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)
:
[
" The first words I spoke in the original phonograph, a little piece"
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go."
,
" slow, and everywhere that Mary went, the lamb was sure to go."
,
...
@@ -77,7 +78,7 @@ EXPECTED = {
...
@@ -77,7 +78,7 @@ EXPECTED = {
" League Championship. I don't believe it. It just continues. My, oh,"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
" my."
],
],
"openai/whisper-large-v3-turbo"
:
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
)
:
[
" The first words I spoke in the original phonograph, a little piece"
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go."
,
" as slow, and everywhere that Mary went the lamb was sure to go."
,
...
@@ -122,14 +123,14 @@ def run_test(
...
@@ -122,14 +123,14 @@ def run_test(
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"openai/whisper-small"
,
"openai/whisper-large-v3-turbo"
])
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-small"
),
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
)
])
def
test_models
(
model
)
->
None
:
def
test_models
(
model
)
->
None
:
run_test
(
model
,
tensor_parallel_size
=
1
)
run_test
(
model
,
tensor_parallel_size
=
1
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"openai/whisper-large-v3-turbo"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
)
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
def
test_models_distributed
(
model
,
distributed_executor_backend
)
->
None
:
def
test_models_distributed
(
model
,
distributed_executor_backend
)
->
None
:
run_test
(
model
,
run_test
(
model
,
...
...
tests/models/encoder_decoder/language/test_bart.py
View file @
04629132
...
@@ -179,7 +179,8 @@ def run_test(
...
@@ -179,7 +179,8 @@ def run_test(
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-large-cnn"
)),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-large-cnn"
)),
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"bfloat16"
])
# @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
...
@@ -201,7 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
...
@@ -201,7 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/bart-large-cnn"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-large-cnn"
)
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
...
...
tests/models/encoder_decoder/vision_language/test_broadcast.py
View file @
04629132
...
@@ -20,7 +20,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
...
@@ -20,7 +20,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
num_logprobs
=
5
num_logprobs
=
5
tensor_parallel_size
=
2
tensor_parallel_size
=
2
if
"meta-llama/Llama-3.2-11B-Vision-Instruct"
in
model
:
if
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
)
in
model
:
# if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
# if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
from
.test_mllama
import
models
,
run_test
from
.test_mllama
import
models
,
run_test
else
:
else
:
...
...
tests/models/multimodal/processing/test_common.py
View file @
04629132
...
@@ -4,6 +4,7 @@ from functools import partial
...
@@ -4,6 +4,7 @@ from functools import partial
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
import
numpy
as
np
import
numpy
as
np
import
os
import
pytest
import
pytest
from
mistral_common.protocol.instruct.messages
import
(
ImageChunk
,
TextChunk
,
from
mistral_common.protocol.instruct.messages
import
(
ImageChunk
,
TextChunk
,
UserMessage
)
UserMessage
)
...
@@ -21,6 +22,7 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer,
...
@@ -21,6 +22,7 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer,
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
HF_EXAMPLE_MODELS
from
...registry
import
HF_EXAMPLE_MODELS
from
....utils
import
models_path_prefix
def
_test_processing_correctness
(
def
_test_processing_correctness
(
...
@@ -245,48 +247,48 @@ def _test_processing_correctness_mistral(
...
@@ -245,48 +247,48 @@ def _test_processing_correctness_mistral(
# yapf: disable
# yapf: disable
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"rhymes-ai/Aria"
,
os
.
path
.
join
(
models_path_prefix
,
"rhymes-ai/Aria"
)
,
"CohereForAI/aya-vision-8b"
,
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/aya-vision-8b"
)
,
"Salesforce/blip2-opt-2.7b"
,
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
)
,
"facebook/chameleon-7b"
,
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
)
,
"deepseek-ai/deepseek-vl2-tiny"
,
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-vl2-tiny"
)
,
"microsoft/Florence-2-base"
,
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Florence-2-base"
)
,
"adept/fuyu-8b"
,
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)
,
"google/gemma-3-4b-it"
,
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3-4b-it"
)
,
"THUDM/glm-4v-9b"
,
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
)
,
"ibm-granite/granite-speech-3.3-8b"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-speech-3.3-8b"
)
,
"h2oai/h2ovl-mississippi-800m"
,
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-800m"
)
,
"OpenGVLab/InternVL2-1B"
,
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
)
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
)
,
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
)
,
"moonshotai/Kimi-VL-A3B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Instruct"
)
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"llava-hf/llava-1.5-7b-hf"
,
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
)
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
)
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
)
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
os
.
path
.
join
(
models_path_prefix
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
)
,
"openbmb/MiniCPM-Llama3-V-2_5"
,
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-Llama3-V-2_5"
)
,
"openbmb/MiniCPM-o-2_6"
,
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-o-2_6"
)
,
"openbmb/MiniCPM-V-2_6"
,
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-V-2_6"
)
,
"allenai/Molmo-7B-D-0924"
,
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
)
,
"allenai/Molmo-7B-O-0924"
,
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-O-0924"
)
,
"nvidia/NVLM-D-72B"
,
os
.
path
.
join
(
models_path_prefix
,
"nvidia/NVLM-D-72B"
)
,
"google/paligemma-3b-mix-224"
,
os
.
path
.
join
(
models_path_prefix
,
"google/paligemma-3b-mix-224"
)
,
"google/paligemma2-3b-ft-docci-448"
,
os
.
path
.
join
(
models_path_prefix
,
"google/paligemma2-3b-ft-docci-448"
)
,
"microsoft/Phi-4-multimodal-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-4-multimodal-instruct"
)
,
"mistralai/Pixtral-12B-2409"
,
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
)
,
"mistral-community/pixtral-12b"
,
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
)
,
"Qwen/Qwen-VL-Chat"
,
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
)
,
"Qwen/Qwen2-VL-2B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
)
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
)
,
"Qwen/Qwen2.5-Omni-7B"
,
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Omni-7B"
)
,
"Skywork/Skywork-R1V-38B"
,
os
.
path
.
join
(
models_path_prefix
,
"Skywork/Skywork-R1V-38B"
)
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
,
"openai/whisper-large-v3"
,
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
])
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
@@ -315,7 +317,7 @@ def test_processing_correctness(
...
@@ -315,7 +317,7 @@ def test_processing_correctness(
# yapf: disable
# yapf: disable
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"microsoft/Phi-3.5-vision-instruct"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"simplify_rate"
,
[
1.0
])
@
pytest
.
mark
.
parametrize
(
"simplify_rate"
,
[
1.0
])
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
04629132
...
@@ -75,7 +75,7 @@ def _run_check(
...
@@ -75,7 +75,7 @@ def _run_check(
assert
pixel_shape
[
0
]
==
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)
]
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
))
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
"size_factors"
,
[
[
...
...
tests/models/multimodal/processing/test_llama4.py
View file @
04629132
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Tests for Llama4's multimodal preprocessing kwargs."""
"""Tests for Llama4's multimodal preprocessing kwargs."""
import
os
import
pytest
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
...
@@ -8,10 +9,11 @@ from vllm.transformers_utils.tokenizer import encode_tokens
...
@@ -8,10 +9,11 @@ from vllm.transformers_utils.tokenizer import encode_tokens
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
from
....utils
import
models_path_prefix
@
pytest
.
mark
.
parametrize
(
"model_id"
,
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
])
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
])
@
pytest
.
mark
.
parametrize
(
"mm_processor_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"mm_processor_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"disable_mm_preprocessor_cache"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"disable_mm_preprocessor_cache"
,
[
True
,
False
])
...
...
tests/models/multimodal/processing/test_llava_next.py
View file @
04629132
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
itertools
import
itertools
from
functools
import
partial
from
functools
import
partial
import
os
import
pytest
import
pytest
from
PIL
import
Image
from
PIL
import
Image
from
pqdm.threads
import
pqdm
from
pqdm.threads
import
pqdm
...
@@ -12,6 +13,7 @@ from vllm.multimodal.parse import ImageSize
...
@@ -12,6 +13,7 @@ from vllm.multimodal.parse import ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
...utils
import
build_model_context
from
...utils
import
build_model_context
from
....utils
import
models_path_prefix
def
_validate_image_max_tokens_one
(
def
_validate_image_max_tokens_one
(
...
@@ -32,7 +34,7 @@ def _validate_image_max_tokens_one(
...
@@ -32,7 +34,7 @@ def _validate_image_max_tokens_one(
@
pytest
.
mark
.
skip
(
"This test takes around 5 minutes to run. "
@
pytest
.
mark
.
skip
(
"This test takes around 5 minutes to run. "
"Comment this out to run it manually."
)
"Comment this out to run it manually."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)
])
def
test_processor_max_tokens
(
model_id
):
def
test_processor_max_tokens
(
model_id
):
ctx
=
build_model_context
(
ctx
=
build_model_context
(
model_id
,
model_id
,
...
@@ -127,7 +129,7 @@ def _test_image_prompt_replacements(
...
@@ -127,7 +129,7 @@ def _test_image_prompt_replacements(
raise
AssertionError
(
msg
)
raise
AssertionError
(
msg
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_prompt_replacements_regression
(
model_id
,
num_imgs
):
def
test_processor_prompt_replacements_regression
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
ctx
=
build_model_context
(
...
@@ -153,7 +155,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
...
@@ -153,7 +155,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@
pytest
.
mark
.
skip
(
"This test takes around 2 hours to run. "
@
pytest
.
mark
.
skip
(
"This test takes around 2 hours to run. "
"Comment this out to run it manually."
)
"Comment this out to run it manually."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
])
def
test_processor_prompt_replacements_all
(
model_id
,
num_imgs
):
def
test_processor_prompt_replacements_all
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
ctx
=
build_model_context
(
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment