Commit afd0da21 authored by zhuwenwen
Browse files

Merge tag 'v0.7.1' into v0.7.1-dev

parents 1a11f127 4f4d427a
...@@ -35,10 +35,13 @@ def test_models( ...@@ -35,10 +35,13 @@ def test_models(
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. def print_model(model):
model_runner.model) print(model)
vllm_model.apply_model(print_model)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -53,10 +53,13 @@ def test_models( ...@@ -53,10 +53,13 @@ def test_models(
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. def print_model(model):
model_runner.model) print(model)
vllm_model.apply_model(print_model)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -50,6 +50,10 @@ from ....utils import models_path_prefix ...@@ -50,6 +50,10 @@ from ....utils import models_path_prefix
), ),
pytest.param(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t")), # stablelm pytest.param(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t")), # stablelm
pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")), # starcoder2 pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")), # starcoder2
pytest.param(
os.path.join(models_path_prefix, "ehristoforu/Falcon3-MoE-2x7B-Insruct"), # mixtral
marks=[pytest.mark.cpu_model],
)
]) ])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
...@@ -71,10 +75,13 @@ def test_models( ...@@ -71,10 +75,13 @@ def test_models(
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. def print_model(model):
model_runner.model) print(model)
vllm_model.apply_model(print_model)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
......
...@@ -10,7 +10,7 @@ from typing import Type ...@@ -10,7 +10,7 @@ from typing import Type
import os import os
import pytest import pytest
from transformers import AutoModelForVision2Seq from transformers import AutoModelForVision2Seq
from transformers.utils import is_flash_attn_2_available from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import identity from vllm.utils import identity
...@@ -141,12 +141,7 @@ VLM_TEST_SETTINGS = { ...@@ -141,12 +141,7 @@ VLM_TEST_SETTINGS = {
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")], models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
tokenizer_mode="slow", test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
),
dtype="bfloat16",
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n", img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len=4096, max_model_len=4096,
...@@ -162,8 +157,8 @@ VLM_TEST_SETTINGS = { ...@@ -162,8 +157,8 @@ VLM_TEST_SETTINGS = {
max_tokens=64, max_tokens=64,
marks=[ marks=[
pytest.mark.skipif( pytest.mark.skipif(
not is_flash_attn_2_available(), TRANSFORMERS_VERSION < "4.48.0",
reason="Model needs flash-attn for numeric convergence.", reason="HF model requires transformers>=4.48.0",
), ),
large_gpu_mark(min_gb=64), large_gpu_mark(min_gb=64),
], ],
...@@ -181,6 +176,7 @@ VLM_TEST_SETTINGS = { ...@@ -181,6 +176,7 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.cast_dtype_post_processor( postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values" "pixel_values"
...@@ -192,6 +188,30 @@ VLM_TEST_SETTINGS = { ...@@ -192,6 +188,30 @@ VLM_TEST_SETTINGS = {
max_tokens=8, max_tokens=8,
dtype="bfloat16", dtype="bfloat16",
), ),
"deepseek_vl_v2": VLMTestInfo(
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
marks=[
pytest.mark.skipif(
TRANSFORMERS_VERSION >= "4.48.0",
reason="HF model is not compatible with transformers>=4.48.0",
)
],
),
"fuyu": VLMTestInfo( "fuyu": VLMTestInfo(
models=[os.path.join(models_path_prefix, "adept/fuyu-8b")], models=[os.path.join(models_path_prefix, "adept/fuyu-8b")],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
...@@ -214,7 +234,7 @@ VLM_TEST_SETTINGS = { ...@@ -214,7 +234,7 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338], get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm_patch_hf_runner, patch_hf_runner=model_utils.glm_patch_hf_runner,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=32)],
), ),
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
models = [ models = [
...@@ -263,6 +283,7 @@ VLM_TEST_SETTINGS = { ...@@ -263,6 +283,7 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
), ),
"llava_next": VLMTestInfo( "llava_next": VLMTestInfo(
models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
...@@ -277,10 +298,8 @@ VLM_TEST_SETTINGS = { ...@@ -277,10 +298,8 @@ VLM_TEST_SETTINGS = {
), ),
limit_mm_per_prompt={"image": 4}, limit_mm_per_prompt={"image": 4},
)], )],
# Llava-next tests fixed sizes & the default size factors
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
), ),
"llava_one_vision": VLMTestInfo( "llava_onevision": VLMTestInfo(
models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
...@@ -291,8 +310,6 @@ VLM_TEST_SETTINGS = { ...@@ -291,8 +310,6 @@ VLM_TEST_SETTINGS = {
), ),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
# Llava-one-vision tests fixed sizes & the default size factors
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
...@@ -309,7 +326,6 @@ VLM_TEST_SETTINGS = { ...@@ -309,7 +326,6 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
), ),
"mantis": VLMTestInfo( "mantis": VLMTestInfo(
models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")], models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")],
...@@ -336,6 +352,20 @@ VLM_TEST_SETTINGS = { ...@@ -336,6 +352,20 @@ VLM_TEST_SETTINGS = {
postprocess_inputs=model_utils.wrap_inputs_post_processor, postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
), ),
"minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_patch_hf_runner
),
"minicpmv_26": VLMTestInfo( "minicpmv_26": VLMTestInfo(
models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")], models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
...@@ -349,6 +379,16 @@ VLM_TEST_SETTINGS = { ...@@ -349,6 +379,16 @@ VLM_TEST_SETTINGS = {
), ),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
), ),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
image_size_factors=[(),(1.0, 1.0, 1.0)],
patch_hf_runner=model_utils.mlomo_patch_hf_runner,
postprocess_inputs=model_utils.molmo_post_processor,
),
# Tests for phi3v currently live in another file because of a bug in # Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead. # transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307 # https://github.com/huggingface/transformers/issues/34307
...@@ -434,7 +474,7 @@ VLM_TEST_SETTINGS = { ...@@ -434,7 +474,7 @@ VLM_TEST_SETTINGS = {
) for inp in custom_inputs.different_patch_input_cases_internvl() ) for inp in custom_inputs.different_patch_input_cases_internvl()
], ],
), ),
"llava_one_vision-multiple-images": VLMTestInfo( "llava_onevision-multiple-images": VLMTestInfo(
models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384, max_model_len=16384,
...@@ -497,12 +537,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) ...@@ -497,12 +537,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - image embeddings # - image embeddings
# - video # - video
# - custom inputs # - custom inputs
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.IMAGE, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=False, test_type=VLMTestType.IMAGE,
)) fork_new_process_for_each_test=False,
))
def test_single_image_models(tmp_path: PosixPath, model_type: str, def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
...@@ -519,12 +560,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ...@@ -519,12 +560,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.MULTI_IMAGE, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=False, test_type=VLMTestType.MULTI_IMAGE,
)) fork_new_process_for_each_test=False,
))
def test_multi_image_models(tmp_path: PosixPath, model_type: str, def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
...@@ -541,12 +583,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ...@@ -541,12 +583,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.EMBEDDING, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=False, test_type=VLMTestType.EMBEDDING,
)) fork_new_process_for_each_test=False,
))
def test_image_embedding_models(model_type: str, def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
...@@ -562,12 +605,13 @@ def test_image_embedding_models(model_type: str, ...@@ -562,12 +605,13 @@ def test_image_embedding_models(model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.VIDEO, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=False, test_type=VLMTestType.VIDEO,
)) fork_new_process_for_each_test=False,
))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets): video_assets: _VideoAssets):
...@@ -581,12 +625,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -581,12 +625,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.CUSTOM_INPUTS, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=False, test_type=VLMTestType.CUSTOM_INPUTS,
)) fork_new_process_for_each_test=False,
))
def test_custom_inputs_models( def test_custom_inputs_models(
model_type: str, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
...@@ -603,12 +648,13 @@ def test_custom_inputs_models( ...@@ -603,12 +648,13 @@ def test_custom_inputs_models(
#### Tests filtering for things running each test as a new process #### Tests filtering for things running each test as a new process
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.IMAGE, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=True, test_type=VLMTestType.IMAGE,
)) fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
...@@ -626,12 +672,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -626,12 +672,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.MULTI_IMAGE, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=True, test_type=VLMTestType.MULTI_IMAGE,
)) fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
...@@ -649,12 +696,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -649,12 +696,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.EMBEDDING, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=True, test_type=VLMTestType.EMBEDDING,
)) fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_image_embedding_models_heavy(model_type: str, def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
...@@ -671,12 +719,13 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -671,12 +719,13 @@ def test_image_embedding_models_heavy(model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.VIDEO, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=True, test_type=VLMTestType.VIDEO,
)) fork_new_process_for_each_test=True,
))
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: Type[VllmRunner],
...@@ -691,12 +740,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -691,12 +740,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
get_parametrized_options( "model_type,test_case",
VLM_TEST_SETTINGS, get_parametrized_options(
test_type=VLMTestType.CUSTOM_INPUTS, VLM_TEST_SETTINGS,
fork_new_process_for_each_test=True, test_type=VLMTestType.CUSTOM_INPUTS,
)) fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_custom_inputs_models_heavy( def test_custom_inputs_models_heavy(
model_type: str, model_type: str,
......
...@@ -138,10 +138,10 @@ def _dump_outputs_w_logprobs( ...@@ -138,10 +138,10 @@ def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs, outputs: OutputsLogprobs,
filename: "StrPath", filename: "StrPath",
) -> None: ) -> None:
json_data = [(tokens, text, json_data = [(tokens, text, [{
[{k: asdict(v) k: asdict(v)
for k, v in token_logprobs.items()} for k, v in token_logprobs.items()
for token_logprobs in (logprobs or [])]) } for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs] for tokens, text, logprobs in outputs]
with open(filename, "w") as f: with open(filename, "w") as f:
...@@ -152,11 +152,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: ...@@ -152,11 +152,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f: with open(filename, "rb") as f:
json_data = json.load(f) json_data = json.load(f)
return [(tokens, text, return [(tokens, text, [{
[{int(k): Logprob(**v) int(k): Logprob(**v)
for k, v in token_logprobs.items()} for k, v in token_logprobs.items()
for token_logprobs in logprobs]) } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
for tokens, text, logprobs in json_data]
@large_gpu_test(min_gb=80) @large_gpu_test(min_gb=80)
......
...@@ -6,7 +6,6 @@ import pytest ...@@ -6,7 +6,6 @@ import pytest
import torch import torch
from PIL import Image from PIL import Image
from vllm.entrypoints.llm import LLM
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
...@@ -71,7 +70,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): ...@@ -71,7 +70,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def batch_make_image_embeddings( def batch_make_image_embeddings(
image_batches: List[Union[Image.Image, List[Image.Image]]], processor, image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL """batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch, This will infer all images' embeddings in a single batch,
...@@ -107,17 +106,19 @@ def batch_make_image_embeddings( ...@@ -107,17 +106,19 @@ def batch_make_image_embeddings(
pixel_values = preprocess_result["pixel_values"] pixel_values = preprocess_result["pixel_values"]
image_grid_thw = preprocess_result["image_grid_thw"] image_grid_thw = preprocess_result["image_grid_thw"]
# pixel values to embeddinds & grid_thws # pixel values to embeddings & grid_thws
with torch.no_grad(): def get_image_embeds(model):
visual = llm.llm_engine.model_executor.driver_worker. \ with torch.no_grad():
model_runner.model.visual visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device, pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype) dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(visual.device, image_grid_thw_on_device = image_grid_thw.to(visual.device,
dtype=torch.int64) dtype=torch.int64)
image_embeds = visual(pixel_values_on_device, return visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device) grid_thw=image_grid_thw_on_device)
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches # split into original batches
result: List[Qwen2VLPromptImageEmbeddingInput] = [] result: List[Qwen2VLPromptImageEmbeddingInput] = []
...@@ -126,11 +127,10 @@ def batch_make_image_embeddings( ...@@ -126,11 +127,10 @@ def batch_make_image_embeddings(
for image_batch in image_batches_: for image_batch in image_batches_:
cur_batch_image_count = len(image_batch) cur_batch_image_count = len(image_batch)
merge_size = image_processor.merge_size merge_size = image_processor.merge_size
cur_batch_embed_len = sum([ cur_batch_embed_len = sum(
grid_thw.prod() // merge_size // merge_size grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in image_grid_thw[image_counter:image_counter + for grid_thw in image_grid_thw[image_counter:image_counter +
cur_batch_image_count] cur_batch_image_count])
])
result.append({ result.append({
"image_embeds": "image_embeds":
...@@ -153,7 +153,7 @@ def batch_make_image_embeddings( ...@@ -153,7 +153,7 @@ def batch_make_image_embeddings(
def batch_make_video_embeddings( def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor, video_batches: PromptVideoInput, processor,
llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL """batched video embeddings for Qwen2-VL
A NDArray represents a single video's all frames. A NDArray represents a single video's all frames.
...@@ -189,17 +189,19 @@ def batch_make_video_embeddings( ...@@ -189,17 +189,19 @@ def batch_make_video_embeddings(
pixel_values = preprocess_result["pixel_values_videos"] pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"] video_grid_thw = preprocess_result["video_grid_thw"]
# pixel values to embeddinds & grid_thws # pixel values to embeddings & grid_thws
with torch.no_grad(): def get_image_embeds(model):
visual = llm.llm_engine.model_executor.driver_worker.\ with torch.no_grad():
model_runner.model.visual visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device)
pixel_values_on_device = pixel_values.to(visual.device, video_embeds = torch.concat(llm.apply_model(get_image_embeds))
dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(visual.device,
dtype=torch.int64)
video_embeds = visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device)
# split into original batches # split into original batches
result: List[Qwen2VLPromptVideoEmbeddingInput] = [] result: List[Qwen2VLPromptVideoEmbeddingInput] = []
...@@ -208,11 +210,10 @@ def batch_make_video_embeddings( ...@@ -208,11 +210,10 @@ def batch_make_video_embeddings(
for video_batch in video_batches_: for video_batch in video_batches_:
cur_batch_video_count = len(video_batch) cur_batch_video_count = len(video_batch)
merge_size = image_processor.merge_size merge_size = image_processor.merge_size
cur_batch_embed_len = sum([ cur_batch_embed_len = sum(
grid_thw.prod() // merge_size // merge_size grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in video_grid_thw[video_counter:video_counter + for grid_thw in video_grid_thw[video_counter:video_counter +
cur_batch_video_count] cur_batch_video_count])
])
result.append({ result.append({
"video_embeds": "video_embeds":
...@@ -282,9 +283,9 @@ def run_embedding_input_test( ...@@ -282,9 +283,9 @@ def run_embedding_input_test(
max_tokens, max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
images=batch_make_image_embeddings( images=batch_make_image_embeddings(
images, processor, vllm_model.model) if images else None, images, processor, vllm_model) if images else None,
videos=batch_make_video_embeddings( videos=batch_make_video_embeddings(
videos, processor, vllm_model.model) if videos else None) videos, processor, vllm_model) if videos else None)
for prompts, images, videos in inputs for prompts, images, videos in inputs
] ]
...@@ -429,130 +430,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, ...@@ -429,130 +430,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
mm_limit=1, mm_limit=1,
tensor_parallel_size=1, tensor_parallel_size=1,
) )
def run_chunked_prefill_test(
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that chunked prefill does not change greedy logprob outputs.

    The same inputs are generated twice: once with a default runner and once
    with ``enable_chunked_prefill=True``. The two result lists are then
    compared case by case with ``check_logprobs_close``.

    Args:
        vllm_runner: Runner factory used as a context manager.
        inputs: One ``(prompts, images, videos)`` triple per test case.
        model: Model name or path handed to the runner.
        dtype: Dtype string handed to the runner.
        max_tokens: Number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        mm_limit: Per-prompt limit applied to both "image" and "video".
        tensor_parallel_size: Forwarded to the runner.
        distributed_executor_backend: Forwarded to the runner.
    """

    def open_runner(**extra_kwargs):
        # A fresh kwargs set (including a fresh limit dict) is built on each
        # call so the two runners never share mutable arguments.
        # NOTE:
        # max_model_len should be greater than image_feature_size
        return vllm_runner(
            model,
            task="generate",
            max_model_len=4000,
            max_num_seqs=4,
            dtype=dtype,
            limit_mm_per_prompt={
                "image": mm_limit,
                "video": mm_limit
            },
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            **extra_kwargs,
        )

    def generate_all(runner):
        # Empty image/video collections are passed as None.
        collected = []
        for prompts, images, videos in inputs:
            collected.append(
                runner.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images or None,
                                                videos=videos or None))
        return collected

    with open_runner() as vllm_model:
        outputs_per_case = generate_all(vllm_model)

    with open_runner(
            enable_chunked_prefill=True,
            # should be small enough to ensure prefilling is chunked
            max_num_batched_tokens=32,
            mm_processor_kwargs={
                "max_pixels": 16 * 28 * 28,
            }) as vllm_model_chunked:
        outputs_per_case_chunked = generate_all(vllm_model_chunked)

    paired_cases = zip(outputs_per_case, outputs_per_case_chunked)
    for plain_outputs, chunked_outputs in paired_cases:
        check_logprobs_close(
            outputs_0_lst=plain_outputs,
            outputs_1_lst=chunked_outputs,
            name_0="non_chunked",
            name_1="chunked",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [1])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts,
                                        model: str, dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Run the chunked-prefill comparison for Qwen2-VL with M-RoPE.

    Only the first example prompt is used. It is wrapped in the chat
    template together with an image placeholder and paired with a
    zero-length image embedding.
    """
    prompts = []
    for prompt in example_prompts[:1]:
        prompts.append(qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt))

    # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs,
    #    so an image is included in the inputs
    # 2. however, Qwen2-VL currently won't work properly
    #    when chunked prefill is enabled and there are some multi-modal inputs,
    #    here use a hacky way: provide a **zero-length** image to make it happy
    #
    # and finally we achieved:
    # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests
    empty_embeds = torch.empty((0, MODEL_HIDDEN_SIZE))
    empty_grid = torch.tensor([[0, 0, 0]])
    zero_len_image = {
        "image_embeds": empty_embeds,
        "image_grid_thw": empty_grid,
    }
    # The same dict object is reused for every prompt.
    images = [zero_len_image for _ in prompts]

    # A single case: the prompts, their placeholder images, and no videos.
    single_case = (prompts, images, [])
    inputs_per_case: List[Tuple[List[str], PromptImageInput,
                                PromptVideoInput]] = [single_case]

    run_chunked_prefill_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
...@@ -5,17 +5,20 @@ typically specific to a small subset of models. ...@@ -5,17 +5,20 @@ typically specific to a small subset of models.
import re import re
import types import types
from pathlib import PosixPath from pathlib import PosixPath
from typing import Callable, List, Optional, Tuple, Union from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoConfig, AutoTokenizer, BatchEncoding from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
GenerationConfig)
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import HfRunner, ImageAsset, _ImageAssets from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
PromptImageInput, PromptVideoInput, _ImageAssets)
from ....utils import TokensTextLogprobs
from .types import RunnerOutput from .types import RunnerOutput
...@@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ...@@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
####### Post-processors for HF outputs ####### Post-processors for HF outputs
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
                                model: str) -> RunnerOutput:
    """Cut the end-of-sentence marker off a DeepSeek-VL2 HF output string.

    Args:
        hf_output: ``(output_ids, output_str, out_logprobs)`` triple.
        model: Unused; present so the signature matches the other
            post-processors in this module.

    Returns:
        The same triple. When ``output_str`` ends with the marker, the string
        is replaced by the text preceding the *first* occurrence of the
        marker; otherwise it is returned untouched. Ids and logprobs are
        passed through as-is.
    """
    eos_marker = "<|end▁of▁sentence|>"
    token_ids, text, logprobs = hf_output
    if text.endswith(eos_marker):
        # Everything from the first marker onwards is dropped.
        text = text.partition(eos_marker)[0]
    return token_ids, text, logprobs
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput: model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output output_ids, output_str, out_logprobs = hf_output
...@@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): ...@@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
return {"model_inputs": hf_inputs} return {"model_inputs": hf_inputs}
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
    """Post-process Molmo HF inputs and add a leading dimension to each entry.

    The inputs are first passed through the post-processor built by
    ``cast_dtype_post_processor("images")`` (presumably casting the
    ``"images"`` entry to ``dtype`` - confirm against that helper), then
    every value is ``unsqueeze(0)``-ed.

    Returns:
        A plain ``dict`` with the same keys as the processed inputs.
    """
    cast_images = cast_dtype_post_processor("images")
    processed = cast_images(hf_inputs, dtype)
    return {name: value.unsqueeze(0) for name, value in processed.items()}
####### Prompt path encoders for models that need models on disk ####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder( def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
...@@ -253,6 +269,34 @@ def qwen_prompt_path_encoder( ...@@ -253,6 +269,34 @@ def qwen_prompt_path_encoder(
####### Model-specific HuggingFace runner patchers ####### Model-specific HuggingFace runner patchers
def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for DeepSeek-VL2.

    Two things are patched on ``hf_model`` in place:

    * ``hf_model.processor`` is replaced by a wrapper that accepts the
      ``text=`` / ``images=`` keywords, forwards them to the original
      processor as ``prompt=`` / ``images=``, and converts the result into a
      ``BatchEncoding``.
    * ``hf_model.model.get_output_embeddings`` is replaced by a callable
      returning ``hf_model.model.language.model.embed_tokens``.

    Returns:
        The same ``hf_model`` object that was passed in.
    """
    hf_processor = hf_model.processor

    def processor(*args, text="", images=None, **kwargs):
        # A single PIL image is wrapped so the processor always gets a list.
        if isinstance(images, Image):
            images = [images]
        # inputs is a custom class instead of dict or BatchFeature
        inputs = hf_processor(
            *args,
            prompt=text,
            images=images,
            **kwargs,
        )
        # Copy everything except the two excluded keys into a plain dict.
        inputs = {
            k: inputs[k]
            for k in inputs.keys()  # noqa
            if k not in ("seq_lens", "sft_format")
        }
        return BatchEncoding(data=inputs, tensor_type="pt")

    hf_model.processor = processor
    # The lambda resolves the attribute chain lazily, on every call.
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.language.model.embed_tokens
    return hf_model
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4.""" """Patches and returns an instance of the HfRunner to use for GLM4."""
hf_processor = hf_model.processor hf_processor = hf_model.processor
...@@ -451,3 +495,99 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -451,3 +495,99 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model.model.generate = types.MethodType(_generate, hf_model.model) hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model return hf_model
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch ``hf_model.model.generate`` to always pass ``decode_text=False``.

    The model's current ``generate`` is captured and replaced by a bound
    wrapper that forwards all positional and keyword arguments to it, adding
    ``decode_text=False``.

    Returns:
        The same ``hf_model`` object, patched in place.
    """
    wrapped_generate = hf_model.model.generate

    def _generate_without_decoding(self, *args, **kwargs):
        # `self` is only there because the wrapper is bound as a method; the
        # captured callable is invoked without it.
        return wrapped_generate(*args, decode_text=False, **kwargs)

    target = hf_model.model
    target.generate = types.MethodType(_generate_without_decoding, target)
    return hf_model
def _generate_greedy_logprobs_limit(
self,
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
# Process in batches for inference.
if len(all_inputs):
input_ids_lst = []
images_lst = []
images_input_idx_lst = []
imges_masks_lst = []
for inputs in all_inputs:
input_ids_lst.append(inputs["input_ids"])
images_lst.append(inputs["images"])
images_input_idx_lst.append(inputs["image_input_idx"])
imges_masks_lst.append(inputs["image_masks"])
batch_inputs = {}
batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
batch_inputs['images'] = torch.cat(images_lst, dim=0)
batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
dim=0)
batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
outputs = self.model.generate_from_batch(
batch=self.wrap_device(batch_inputs,
device=self.model.device.type),
generation_config=GenerationConfig(
max_new_tokens=max_tokens,
stop_strings="<|endoftext|>",
do_sample=False,
),
tokenizer=self.tokenizer,
output_hidden_states=True,
return_dict_in_generate=True,
)
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
for index in range(len(all_inputs)):
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(outputs.hidden_states,
num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = outputs.sequences[index]
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
####### Molmo-specific HuggingFace runner patchers
def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Molmo.

    ``hf_model.processor`` is replaced by a function that forwards to the
    original processor's ``process`` method, and
    ``_generate_greedy_logprobs_limit`` is bound onto the runner as
    ``generate_greedy_logprobs_limit``.
    """
    hf_processor = hf_model.processor

    def _process_inputs(*args, **kwargs):
        return hf_processor.process(*args, **kwargs)

    bound_generate = types.MethodType(_generate_greedy_logprobs_limit,
                                      hf_model)

    hf_model.processor = _process_inputs
    setattr(  # noqa: B010
        hf_model,
        "generate_greedy_logprobs_limit",
        bound_generate,
    )
    return hf_model
...@@ -26,10 +26,13 @@ def test_classification_models( ...@@ -26,10 +26,13 @@ def test_classification_models(
) -> None: ) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts) vllm_outputs = vllm_model.classify(example_prompts)
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. def print_model(model):
model_runner.model) print(model)
vllm_model.apply_model(print_model)
with hf_runner(model, with hf_runner(model,
dtype=dtype, dtype=dtype,
......
...@@ -18,15 +18,18 @@ from vllm.platforms import current_platform ...@@ -18,15 +18,18 @@ from vllm.platforms import current_platform
# [Encoder-only] # [Encoder-only]
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"), pytest.param(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
marks=[pytest.mark.core_model, pytest.mark.cpu_model]), marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")),
pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")), pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")),
# [Encoder-decoder] # [Decoder-only]
pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"), pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
marks=[pytest.mark.core_model]), marks=[pytest.mark.core_model]),
pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")), pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-1.5B-instruct")), pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-1.5B-instruct")),
pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-7B-instruct")), pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-7B-instruct")),
pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")),
# [Encoder-decoder]
pytest.param(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),
], ],
) )
...@@ -66,10 +69,13 @@ def test_models( ...@@ -66,10 +69,13 @@ def test_models(
max_model_len=None, max_model_len=None,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.encode(example_prompts)
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. def print_model(model):
model_runner.model) print(model)
vllm_model.apply_model(print_model)
check_embeddings_close( check_embeddings_close(
embeddings_0_lst=hf_outputs, embeddings_0_lst=hf_outputs,
......
...@@ -6,6 +6,8 @@ import math ...@@ -6,6 +6,8 @@ import math
import os import os
import pytest import pytest
import torch
import torch.nn.functional as F
from ....utils import models_path_prefix from ....utils import models_path_prefix
MODELS = [ MODELS = [
...@@ -13,6 +15,10 @@ MODELS = [ ...@@ -13,6 +15,10 @@ MODELS = [
os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), # Roberta os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), # Roberta
] ]
# Resolved under models_path_prefix, like the MODELS list in this file, so the
# embedding model is looked up in the same local model directory.
EMBEDDING_MODELS = [
    os.path.join(models_path_prefix,
                 "sentence-transformers/all-MiniLM-L12-v2"),
]
TEXTS_1 = [ TEXTS_1 = [
"What is the capital of France?", "What is the capital of France?",
"What is the capital of Germany?", "What is the capital of Germany?",
...@@ -89,3 +95,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): ...@@ -89,3 +95,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
    """Module-scoped fixture parametrized over ``EMBEDDING_MODELS``."""
    model_name = request.param
    yield model_name
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Score one text pair and compare vLLM against an HF reference.

    The reference is the cosine similarity of the two embeddings returned by
    the HF runner; the vLLM score must match within a 1% relative tolerance.
    """
    first_text, second_text = TEXTS_1[0], TEXTS_2[0]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = hf_model.encode([first_text, second_text])
        embedding_tensors = map(torch.tensor, hf_embeddings)
        hf_outputs = [F.cosine_similarity(*embedding_tensors, dim=0)]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(first_text, second_text)

    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Score one query against all of ``TEXTS_2`` and compare with HF.

    The HF reference pairs the first entry of ``TEXTS_1`` with the first two
    entries of ``TEXTS_2`` and takes the cosine similarity of each pair's
    embeddings. Two scores are expected from each side, matching within a 1%
    relative tolerance.
    """
    query = TEXTS_1[0]
    text_pairs = [[query, TEXTS_2[idx]] for idx in (0, 1)]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = [hf_model.encode(pair) for pair in text_pairs]
        hf_outputs = []
        for pair_embeddings in hf_embeddings:
            tensors = map(torch.tensor, pair_embeddings)
            hf_outputs.append(F.cosine_similarity(*tensors, dim=0))

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(query, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for idx in range(2):
        assert math.isclose(hf_outputs[idx], vllm_outputs[idx], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Score ``TEXTS_1`` against ``TEXTS_2`` and compare with HF.

    The HF reference pairs the first two entries of both lists index by
    index and takes the cosine similarity of each pair's embeddings. Two
    scores are expected from each side, matching within a 1% relative
    tolerance.
    """
    text_pairs = [[TEXTS_1[idx], TEXTS_2[idx]] for idx in (0, 1)]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_embeddings = [hf_model.encode(pair) for pair in text_pairs]
        hf_outputs = []
        for pair_embeddings in hf_embeddings:
            tensors = map(torch.tensor, pair_embeddings)
            hf_outputs.append(F.cosine_similarity(*tensors, dim=0))

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for idx in range(2):
        assert math.isclose(hf_outputs[idx], vllm_outputs[idx], rel_tol=0.01)
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
from typing import Optional
import pytest
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from ....utils import fork_new_process_for_each_test, multi_gpu_test
PROMPTS = [
{
"prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
}
]
EXPECTED = {
"openai/whisper-tiny": [
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" would sure to go.",
" >> And the old one, fit John the way to Edgar Martinez. >> One more"
" to line down the field line for our base camp. Here comes joy. Here"
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five."
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the old one pitch on the way to Edgar Martinez one month. Here"
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my."
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez swung on the line"
" down the left field line for Obeyshev. Here comes Joy. Here is"
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my."
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
" Now the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
" down the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
]
}
def run_test(
    model: str,
    *,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    """Generate transcriptions with vLLM and compare them to ``EXPECTED``.

    ``PROMPTS`` and the expected texts for ``model`` are each repeated ten
    times, generated with ``temperature=0``, and every output text must equal
    its expected string exactly.

    Args:
        model: Model name; must be a key of ``EXPECTED``.
        tensor_parallel_size: Forwarded to ``LLM``.
        distributed_executor_backend: Forwarded to ``LLM``.

    Raises:
        KeyError: If ``model`` has no entry in ``EXPECTED``.
        AssertionError: If the number of outputs or any output text differs
            from what is expected.
    """
    prompt_list = PROMPTS * 10
    expected_list = EXPECTED[model] * 10

    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=200,
    )

    outputs = llm.generate(prompt_list, sampling_params)

    # zip() stops at the shorter input, so without this check a run that
    # returned too few outputs would pass without comparing anything.
    assert len(outputs) == len(expected_list), (
        f"expected {len(expected_list)} outputs, got {len(outputs)}")

    for output, expected in zip(outputs, expected_list):
        print(output.outputs[0].text)
        assert output.outputs[0].text == expected
@fork_new_process_for_each_test
@pytest.mark.core_model
@pytest.mark.parametrize(
    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None:
    """Run the Whisper comparison with a tensor-parallel size of 1."""
    run_test(model=model, tensor_parallel_size=1)
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None:
    """Run the Whisper comparison with a tensor-parallel size of 2.

    Parametrized over the "ray" and "mp" distributed executor backends.
    """
    run_test(
        model=model,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
...@@ -2,11 +2,15 @@ from typing import List, Optional, Tuple, Type, overload ...@@ -2,11 +2,15 @@ from typing import List, Optional, Tuple, Type, overload
import os import os
import pytest import pytest
import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding) BatchEncoding)
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.attention.selector import (_Backend, _cached_get_attn_backend, from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
global_force_attn_backend_context_manager) global_force_attn_backend_context_manager)
from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID,
MllamaForConditionalGeneration)
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -35,6 +39,29 @@ models = [ ...@@ -35,6 +39,29 @@ models = [
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct"),
] ]
# Indices for inputs
TEXT_ONLY = '0'
IMAGE_AT_BEG = '1'
IMAGE_AT_MIDDLE = '2'
TWO_IMAGES = '3'
# Input tokenized
prompt_data = {
# Tell me a story
TEXT_ONLY: [41551, 757, 264, 3446],
# <|image|> What's the content of this image
IMAGE_AT_BEG:
[MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220],
# Hello <|image|>What' the content of this image
IMAGE_AT_MIDDLE:
[9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217],
#<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
TWO_IMAGES: [
MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
]
}
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]], Optional[SampleLogprobs]],
...@@ -367,3 +394,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, ...@@ -367,3 +394,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=1, tensor_parallel_size=1,
) )
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                    num_logprobs, attn_backend: _Backend) -> None:
    """Regression scenarios run against one vLLM model instance.

    1. More ``<|image|>`` tags than images must raise ``ValueError``.
    2. A batch mixing a text-only request with an image request must run.
    3. The same mix in the opposite order must run as well.
    """
    stop_sign = image_assets[0].pil_image

    with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
            model,
            dtype=dtype,
            max_model_len=4096,
            max_num_seqs=2,
            tensor_parallel_size=1,
            enforce_eager=True,
            limit_mm_per_prompt={"image":
                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:

        def generate(prompts, images):
            # Shared call shape for all three scenarios.
            return vllm_model.generate_greedy_logprobs(prompts,
                                                       max_tokens,
                                                       num_logprobs,
                                                       images=images)

        # Regression tests for https://github.com/vllm-project/vllm/issues/10648

        # Number of image tags is greater than the number of images provided
        prompt = "<|begin_of_text|><|image|><|image|> Compare the two images"  # noqa: E501
        image = stop_sign
        with pytest.raises(ValueError):
            generate([prompt], [image])

        # Batch of a text-only and image request that requires cross-attention
        generate(
            [
                "What is the capital of spain?",
                "Text before the image...<|image|>What is in the image?",  # noqa: E501
            ],
            [
                None,
                [stop_sign],
            ],
        )

        # Test the reverse order too for good measure
        generate(
            [
                "<|begin_of_text|>Text before the image...<|image|>What is in the image?",  # noqa: E501
                "<|begin_of_text|>Hello!",
            ],
            [
                [stop_sign],
                None,
            ],
        )
@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices_and_output",
    # inputs, (cross_attention_mask, kv_range_for_decode)
    [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
     ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
     ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
      ((23, 24), [[0, 6], [6, 12]])),
     ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
     ([TWO_IMAGES], ((18, 12), [[6, 12]])),
     ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
def test_get_cross_attention_mask(input_indices_and_output) -> None:
    """Check ``MllamaForConditionalGeneration.get_cross_attention_mask``.

    Each parameter is ``(input_indices, (expected_mask_shape,
    expected_kv_range_for_decode))``. The selected tokenized prompts from
    ``prompt_data`` are concatenated into one token tensor. The method is
    called unbound with an empty dict standing in for ``self``. The returned
    mask is compared by shape only (or must be ``None``), and the returned
    kv range must equal the expected one.
    """
    input_indices, expected_output = input_indices_and_output
    # The first expected element is the *shape* of the mask, not the mask.
    expected_mask_shape, expected_kv_range_for_decode = expected_output

    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
    # One [2, 2] tile entry per request that carries images; text-only
    # requests contribute no entry.
    num_tiles = [[2, 2] for i in input_indices if i != TEXT_ONLY]
    input_ids = torch.cat(sequences)

    seq_lens = [len(s) for s in sequences]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_prefill_tokens=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    # Stand-in for `self`; the method is invoked unbound on the class.
    dummy: dict[str, str] = {}

    cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
        .get_cross_attention_mask(dummy,
                                  input_ids,
                                  attn_data,
                                  num_tiles=num_tiles,
                                  num_tokens_per_tile=3,
                                  dtype=torch.bfloat16)

    assert kv_range_for_decode == expected_kv_range_for_decode
    if expected_mask_shape is not None:
        assert cross_attention_mask is not None
        assert cross_attention_mask.shape == expected_mask_shape
    else:
        assert cross_attention_mask is None
@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices",
    [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
     [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
     [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
def test_get_full_text_row_masked_out_mask(input_indices) -> None:
    """Check ``get_full_text_row_masked_out_mask`` token by token.

    The method is called unbound with an empty dict standing in for ``self``.
    After squeezing, the mask must have one entry per prefill token, and each
    entry must equal whether the owning sequence is something other than
    ``TEXT_ONLY``.
    """
    seq_lens = [len(torch.tensor(prompt_data[i])) for i in input_indices]
    num_prefill_tokens = sum(seq_lens)

    # The index strings double as encoder lengths: TEXT_ONLY is '0', so its
    # encoder length is zero and it will be masked out; the others are
    # non-zero and should not be.
    encoder_seq_lens = [int(i) for i in input_indices]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        encoder_seq_lens=encoder_seq_lens,
        num_prefill_tokens=num_prefill_tokens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    # Stand-in for `self`; the method is invoked unbound on the class.
    dummy: dict[str, str] = {}

    full_text_row_masked_out_mask = MllamaForConditionalGeneration\
        .get_full_text_row_masked_out_mask(dummy,
                                           attn_data,
                                           torch.get_default_device())

    full_text_row_masked_out_mask = \
        full_text_row_masked_out_mask.squeeze().tolist()

    assert len(full_text_row_masked_out_mask) == num_prefill_tokens

    # Expected value for every token position, in sequence order.
    expected_per_token = [
        input_indices[i] != TEXT_ONLY
        for i, seq_len in enumerate(seq_lens)
        for _ in range(seq_len)
    ]
    for idx, must_be_masked in enumerate(expected_per_token):
        assert full_text_row_masked_out_mask[idx] == must_be_masked, \
            f"full_text_row_masked_out_mask[{idx}] must be " \
            f"'{must_be_masked}' "
from functools import partial
import numpy as np
import pytest
from PIL import Image
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
def _test_processing_correctness(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Check that cached and uncached multi-modal processing agree.

    For ``num_batches`` randomly generated batches, the same prompt and
    multi-modal data are run through a processor built without a cache and
    one built with a cache, both from a text prompt and from its tokenized
    form. All four results must be equal.

    Args:
        model_id: Model to look up in ``HF_EXAMPLE_MODELS``.
        hit_rate: Probability that an item is the fixed "hit" input rather
            than a freshly generated random one.
        num_batches: Number of random batches to test.
        simplify_rate: Probability that a batch has its empty modalities
            dropped and its single-item lists unwrapped before processing.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    model_config = ModelConfig(
        model_id,
        task="auto",
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=model_info.trust_remote_code,
        seed=0,
        dtype="float16",
        revision=None,
        hf_overrides=model_info.hf_overrides,
    )

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
    ctx = InputProcessingContext(
        model_config,
        tokenizer=cached_get_tokenizer(
            model_config.tokenizer,
            trust_remote_code=model_info.trust_remote_code,
        ),
    )
    # Ensure that it can fit all of the data
    cache = ProcessingCache(capacity=1 << 30)

    processing_info = factories.info(ctx)
    supported_mm_limits = processing_info.get_supported_mm_limits()
    # Modalities without an explicit limit are capped at 3 items.
    limit_mm_per_prompt = {
        modality: 3 if limit is None else limit
        for modality, limit in supported_mm_limits.items()
    }

    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)
    dummy_inputs = baseline_processor.dummy_inputs
    tokenizer = baseline_processor.info.get_tokenizer()

    # Fixed seed so every run draws the same batches.
    rng = np.random.RandomState(0)

    # Fixed inputs that are reused across batches.
    input_to_hit = {
        "image": Image.new("RGB", size=(128, 128)),
        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
        "audio": (np.zeros((512, )), 16000),
    }
    # Factories for fresh random inputs.
    input_factory = {
        "image":
        partial(random_image, rng, min_wh=128, max_wh=256),
        "video":
        partial(random_video,
                rng,
                min_frames=2,
                max_frames=8,
                min_wh=128,
                max_wh=256),
        "audio":
        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
    }

    for batch_idx in range(num_batches):
        # randint's upper bound is exclusive, so `limit + 1` lets a batch
        # contain anywhere from 0 to `limit` items. With a bare `limit`, a
        # modality limited to one item would never receive any data, and a
        # limit of 0 would raise ValueError.
        mm_data = {
            k:
            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
             for _ in range(rng.randint(limit + 1))]
            for k, limit in limit_mm_per_prompt.items()
        }

        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
        prompt = dummy_inputs.get_dummy_processor_inputs(
            model_config.max_model_len,
            mm_counts,
        ).prompt_text

        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
            for k in list(mm_data.keys()):
                if not mm_data[k]:
                    del mm_data[k]
                elif len(mm_data[k]) == 1:
                    mm_data[k] = mm_data[k][0]

        # Text prompt: uncached vs cached.
        baseline_result = baseline_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
        cached_result = cached_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == cached_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        # Tokenized prompt must give the same result as the text prompt.
        baseline_tokenized_result = baseline_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == baseline_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        cached_tokenized_result = cached_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert cached_result == cached_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
# yapf: disable
@pytest.mark.parametrize("model_id", [
    "rhymes-ai/Aria",
    "Salesforce/blip2-opt-2.7b",
    "facebook/chameleon-7b",
    "deepseek-ai/deepseek-vl2-tiny",
    "adept/fuyu-8b",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
    "mistral-community/pixtral-12b",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",
    "Qwen/Qwen-VL-Chat",
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
    "fixie-ai/ultravox-v0_3",
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Run the shared processing-correctness check for each listed model."""
    _test_processing_correctness(
        model_id=model_id,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )
# yapf: disable
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness_phi3v(
    model_id: str,
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    """Processing-correctness check for Phi-3-vision.

    Kept separate from the generic test because the image processor is
    loaded up front as a workaround before the shared check runs.
    """
    # HACK - this is an attempted workaround for the following bug
    # https://github.com/huggingface/transformers/issues/34307
    from transformers import AutoImageProcessor, AutoProcessor  # noqa: F401

    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)

    _test_processing_correctness(
        model_id=model_id,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )
...@@ -9,9 +9,9 @@ from transformers import AutoImageProcessor, AutoTokenizer ...@@ -9,9 +9,9 @@ from transformers import AutoImageProcessor, AutoTokenizer
from vllm.inputs import InputContext, token_inputs from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets from ....conftest import _ImageAssets
from ....utils import build_model_context from ...utils import build_model_context
from .....utils import models_path_prefix from ....utils import models_path_prefix
models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")] models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
......
...@@ -8,9 +8,9 @@ from transformers import AutoTokenizer ...@@ -8,9 +8,9 @@ from transformers import AutoTokenizer
from vllm.inputs import InputContext, token_inputs from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets from ....conftest import _ImageAssets
from ....utils import build_model_context from ...utils import build_model_context
from .....utils import models_path_prefix from ....utils import models_path_prefix
models = [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")] models = [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")]
......
import itertools
from functools import partial
import pytest
from PIL import Image
from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
from ...utils import build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Check that one image size stays within the image-token budget.

    Failures are recorded rather than raised so that a sweep over many
    sizes can report all of them at once.

    Args:
        processor: Processor whose ``info.get_num_image_tokens`` is queried.
        max_tokens: Upper bound the computed token count must not exceed.
        failed_size_excs: Shared list; a ``(image_size, exception)`` pair is
            appended for every failure.
        image_size: Image dimensions to check.
    """
    try:
        # The token computation is deliberately inside the ``try``: if it
        # raised outside, the error would not be appended to
        # ``failed_size_excs``, which is the only channel the caller
        # inspects, and the failing size would go unreported.
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported token maximum.

    One image size is kept per distinct aspect ratio in ``[1, 2]`` and each
    is validated against ``processor.info.get_max_image_tokens()``.

    Raises:
        AssertionError: If any size failed validation; the message lists
            every failing size with its exception.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # Put the header on its own line; previously the first failing
        # entry was glued directly onto it.
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        raise AssertionError(f"Found failing image sizes:\n{details}")
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Apply the processor to ``num_imgs`` blank images of a single size.

    The prompt is ``"<image>"`` repeated ``num_imgs`` times. Any exception
    raised by the processor or by the placeholder checks is appended to
    ``failed_size_excs`` together with ``image_size`` instead of being
    propagated.
    """
    blank_image = Image.new("RGB", size=image_size)
    mm_data = {"image": [blank_image] * num_imgs}
    prompt = "<image>" * num_imgs

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        outputs = processor.apply(prompt, mm_data, {})

        placeholders = outputs["mm_placeholders"]["image"]
        assert len(placeholders) == num_imgs

        head = placeholders[0]
        # NOTE: There is a BOS token
        assert head["offset"] == 1

        actual_length = head["length"]
        expected_length = (len(outputs["prompt_token_ids"]) - 1) // num_imgs
        assert actual_length == expected_length
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Ensure LlavaNextMultiModalProcessor
    handles prompt replacement properly for input images.

    Args:
        processor: Processor instance handed to the per-size validator.
        num_imgs: Number of images (and ``<image>`` tags) per prompt.
        image_sizes: Image sizes to validate.

    Raises:
        AssertionError: If any size failed validation; the message lists
            every failing size with its exception.
    """
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # Put the header on its own line; previously the first failing
        # entry was glued directly onto it.
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        raise AssertionError(f"Found failing image sizes:\n{details}")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Validate prompt replacement on a fixed list of image dimensions.

    Each ``(w, h)`` pair is checked in both orientations, ``(w, h)``
    followed by ``(h, w)``.
    """
    model_ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(model_ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config,
        tokenizer=tokenizer,
    )

    base_dims = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                 (488, 183), (2560, 1669)]

    # Test every pair as given and with width/height swapped.
    image_sizes = []
    for w, h in base_dims:
        image_sizes.extend((ImageSize(w, h), ImageSize(h, w)))

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
@pytest.mark.skip("This test takes around 2 hours to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """Validate prompt replacement across a sweep of image sizes.

    Widths and heights range over ``[64, 1024)``; the first size seen for
    each distinct aspect ratio within ``[1, 2]`` is kept.
    """
    model_ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(model_ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config,
        tokenizer=tokenizer,
    )

    seen_ratios: set[float] = set()
    image_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for width, height in itertools.product(range(64, 1024), repeat=2):
        ratio = width / height
        if not 1 <= ratio <= 2 or ratio in seen_ratios:
            continue
        image_sizes.append(ImageSize(width, height))
        seen_ratios.add(ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
import itertools
from functools import partial
import pytest
from PIL import Image
from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
from ...utils import build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Check that one image size stays within the image-token budget.

    Failures are recorded rather than raised so that a sweep over many
    sizes can report all of them at once.

    Args:
        processor: Processor whose ``info.get_num_image_tokens`` is queried.
        max_tokens: Upper bound the computed token count must not exceed.
        failed_size_excs: Shared list; a ``(image_size, exception)`` pair is
            appended for every failure.
        image_size: Image dimensions to check.
    """
    try:
        # The token computation is deliberately inside the ``try``: if it
        # raised outside, the error would not be appended to
        # ``failed_size_excs``, which is the only channel the caller
        # inspects, and the failing size would go unreported.
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported token maximum.

    One image size is kept per distinct aspect ratio in ``[1, 6]`` and each
    is validated against ``processor.info.get_max_image_tokens()``.

    Raises:
        AssertionError: If any size failed validation; the message lists
            every failing size with its exception.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # Put the header on its own line; previously the first failing
        # entry was glued directly onto it.
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        raise AssertionError(f"Found failing image sizes:\n{details}")
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Apply the processor to ``num_imgs`` blank images of a single size.

    The prompt is ``"<image>"`` repeated ``num_imgs`` times. Any exception
    raised by the processor or by the placeholder checks is appended to
    ``failed_size_excs`` together with ``image_size`` instead of being
    propagated.
    """
    blank_image = Image.new("RGB", size=image_size)
    mm_data = {"image": [blank_image] * num_imgs}
    prompt = "<image>" * num_imgs

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        outputs = processor.apply(prompt, mm_data, {})

        placeholders = outputs["mm_placeholders"]["image"]
        assert len(placeholders) == num_imgs

        head = placeholders[0]
        # The first placeholder is expected at the very start of the prompt.
        assert head["offset"] == 0

        actual_length = head["length"]
        expected_length = len(outputs["prompt_token_ids"]) // num_imgs
        assert actual_length == expected_length
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Ensure LlavaOnevisionMultiModalProcessor
    handles prompt replacement properly for input images.

    Args:
        processor: Processor instance handed to the per-size validator.
        num_imgs: Number of images (and ``<image>`` tags) per prompt.
        image_sizes: Image sizes to validate.

    Raises:
        AssertionError: If any size failed validation; the message lists
            every failing size with its exception.
    """
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # Put the header on its own line; previously the first failing
        # entry was glued directly onto it.
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        raise AssertionError(f"Found failing image sizes:\n{details}")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """Validate prompt replacement on a fixed list of image dimensions.

    Each ``(w, h)`` pair is checked in both orientations, ``(w, h)``
    followed by ``(h, w)``.
    """
    model_ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(model_ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config,
        tokenizer=tokenizer,
    )

    base_dims = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                 (488, 183), (2560, 1669)]

    # Test every pair as given and with width/height swapped.
    image_sizes = []
    for w, h in base_dims:
        image_sizes.extend((ImageSize(w, h), ImageSize(h, w)))

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
@pytest.mark.skip("This test takes around 2 hours to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """Validate prompt replacement across a sweep of image sizes.

    Widths and heights range over ``[64, 1024)``; the first size seen for
    each distinct aspect ratio within ``[1, 6]`` is kept.
    """
    model_ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(model_ctx.model_config.tokenizer)
    processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config,
        tokenizer=tokenizer,
    )

    seen_ratios: set[float] = set()
    image_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for width, height in itertools.product(range(64, 1024), repeat=2):
        ratio = width / height
        if not 1 <= ratio <= 6 or ratio in seen_ratios:
            continue
        image_sizes.append(ImageSize(width, height))
        seen_ratios.add(ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment