test_common.py 46.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
6
import math
7
import os
8
from collections import defaultdict
9
10
from pathlib import PosixPath

zhuwenwen's avatar
zhuwenwen committed
11
import os
12
import pytest
13
from transformers import (AutoModel, AutoModelForImageTextToText,
14
                          AutoModelForTextToWaveform, AutoModelForVision2Seq)
15
16

from vllm.platforms import current_platform
17
from vllm.utils import identity
18

19
20
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
                          ImageTestAssets, VideoTestAssets, VllmRunner)
21
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
22
                       multi_gpu_marks)
23
24
25
26
27
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
                              VLMTestInfo, VLMTestType)
zhuwenwen's avatar
zhuwenwen committed
28
from ....utils import models_path_prefix
29
30
31
32
33
34
35
36

# Workaround required by the phi3v & paligemma models: on ROCm, Triton FA
# can hit shared memory issues with them, so the flag below is set to "0"
# in order to use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ.update(VLLM_USE_TRITON_FLASH_ATTN="0")

37
38
39
40
41
42
43
44
# Test keys listed as requiring V0, each with the reason noted for V1 tests:
#   - "qwen_vl": no way to fall back for head_dim = 80, see
#     https://github.com/vllm-project/vllm/issues/14524
#   - "fuyu": not enough KV cache space in C1.
REQUIRES_V0_MODELS = ["qwen_vl", "fuyu"]

45
46
47
48
49
50
# yapf: disable
# Shared keyword settings for the broadcast test configurations: an image
# test in half precision with a short generation length, a tensor parallel
# size of 2, and both the "ray" and "mp" distributed executor backends.
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(0.25, 0.5, 1.0)],
    "distributed_executor_backend": (
        "ray",
        "mp",
    ),
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example:
#
# To run all test types for a specific key:
#     use the k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#                 - pytest $TEST_FILE -k "[llava-"
#     prevents matching on "[llava_next-" & will match just the enabled cases
#     for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the k flag to substring match the model name, e.g.,
#                 - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#     prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#        pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#        pytest $TEST_FILE -k  "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.

VLM_TEST_SETTINGS = {
93
94
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
95
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
96
97
98
99
100
101
102
103
        test_type=(
            VLMTestType.EMBEDDING,
            VLMTestType.IMAGE,
            VLMTestType.CUSTOM_INPUTS
        ),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
104
        auto_cls=AutoModelForImageTextToText,
105
106
107
108
109
110
111
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
112
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
113
114
    ),
    "paligemma": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
115
        models=[os.path.join(models_path_prefix, "google/paligemma-3b-mix-224")],
116
117
118
119
120
121
122
123
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt = lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
124
        auto_cls=AutoModelForImageTextToText,
125
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
126
127
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
128
    ),
zhuwenwen's avatar
zhuwenwen committed
129

Roger Wang's avatar
Roger Wang committed
130
    "qwen2_5_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
131
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
132
133
134
135
136
137
138
139
140
141
142
143
144
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
145
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
146
    ),
147
    "qwen2_5_omni": VLMTestInfo(
148
        models=["Qwen/Qwen2.5-Omni-3B"],
149
150
151
152
153
154
155
156
157
158
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
159
        auto_cls=AutoModelForTextToWaveform,
160
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
161
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
162
163
164
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
165
166
167
168
169
170
171
172
173
174
175
    "ultravox": VLMTestInfo(
        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
        test_type=VLMTestType.AUDIO,
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
176
    #### Extended model tests
177
    "aria": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
178
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
179
180
181
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
Roger Wang's avatar
Roger Wang committed
182
183
        max_model_len=4096,
        max_num_seqs=2,
184
185
186
187
188
189
190
191
192
193
194
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<vlm_image>Please describe the image shortly.",
            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
        }),
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
Jennifer Zhao's avatar
Jennifer Zhao committed
195
    "aya_vision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
196
        models=[os.path.join(models_path_prefix, "CohereForAI/aya-vision-8b")],
197
        test_type=(VLMTestType.IMAGE),
Jennifer Zhao's avatar
Jennifer Zhao committed
198
199
200
201
202
203
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
Jennifer Zhao's avatar
Jennifer Zhao committed
219
220
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
221
222
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
Roger Wang's avatar
Roger Wang committed
223
    ),
224
    "blip2": VLMTestInfo(
225
        # TODO: Change back to 2.7b once head_dim = 80 is supported
zhuwenwen's avatar
zhuwenwen committed
226
        models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
227
228
229
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
230
        auto_cls=AutoModelForImageTextToText,
231
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
232
233
        # FIXME: https://github.com/huggingface/transformers/pull/38510
        marks=[pytest.mark.skip("Model is broken")],
234
235
    ),
    "chameleon": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
236
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
237
238
239
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
240
        max_num_seqs=2,
241
        auto_cls=AutoModelForImageTextToText,
242
243
244
245
246
247
248
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
249
    "deepseek_vl_v2": VLMTestInfo(
250
        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
251
252
253
254
255
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        single_image_prompts=IMAGE_ASSETS.prompts({
256
257
            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
258
        }),
259
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
260
261
262
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
263
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
264
    ),
265
    "fuyu": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
266
        models=[os.path.join(models_path_prefix, "adept/fuyu-8b")],
267
268
269
270
271
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
272
        auto_cls=AutoModelForImageTextToText,
273
274
275
276
277
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
278
    "gemma3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
279
        models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
280
281
282
283
284
285
286
287
288
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
289
        auto_cls=AutoModelForImageTextToText,
290
291
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
292
        num_logprobs=10,
293
    ),
294
    "glm4v": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
295
        models=[os.path.join(models_path_prefix, "THUDM/glm-4v-9b")],
296
        test_type=VLMTestType.IMAGE,
297
298
299
300
301
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
        }),
302
303
304
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
305
306
307
308
309
310
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
311
        marks=[large_gpu_mark(min_gb=32)],
312
313
314
    ),
    "h2ovl": VLMTestInfo(
        models = [
zhuwenwen's avatar
zhuwenwen committed
315
            os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
316
317
            # TODO: Re-enable once head_dim = 80 is supported
            # "h2oai/h2ovl-mississippi-2b",
318
319
320
321
322
323
324
325
326
327
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
328
        num_logprobs=10,
329
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
330
    ),
331
    "idefics3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
332
        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
333
334
335
336
337
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
338
        auto_cls=AutoModelForImageTextToText,
339
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
340
    ),
341
342
    "intern_vl": VLMTestInfo(
        models=[
zhuwenwen's avatar
zhuwenwen committed
343
344
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"),
345
346
            # FIXME: Config cannot be loaded in transformers 4.52
            # "OpenGVLab/Mono-InternVL-2B",
347
348
349
350
351
352
353
354
355
356
357
358
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
359
360
361
362
363
364
365
366
367
368
369
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
370
371
372
373
374
375
376
377
378
379
380
381
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        dtype="bfloat16",
        tensor_parallel_size=1,
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
382
383
384
385
386
387
388
389
390
391
392
393
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
394
395
        tensor_parallel_size=4,
        marks=multi_gpu_marks(num_gpus=4),
396
    ),
397
    "llava_next": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
398
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
399
400
401
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
402
        auto_cls=AutoModelForImageTextToText,
403
404
405
406
407
408
409
410
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
411
    "llava_onevision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
412
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
413
414
415
416
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
417
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
418
419
420
421
422
423
424
425
426
427
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
            ),
            limit_mm_per_prompt={"video": 4},
        )],
    ),
    "llava_next_video": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
428
        models=[os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf")],
429
430
431
432
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
433
        max_num_seqs=2,
434
435
436
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
437
    "mantis": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
438
        models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")],
439
440
441
442
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
443
        auto_cls=AutoModelForImageTextToText,
444
445
446
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
447
    "minicpmv_25": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
448
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")],
449
        test_type=VLMTestType.IMAGE,
450
451
452
453
454
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
455
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
456
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
457
458
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
459
    ),
460
461
462
463
464
465
466
467
468
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
469
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
470
471
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
472
    ),
473
    "minicpmv_26": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
474
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
475
476
477
478
479
480
481
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
482
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
483
484
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
485
    ),
486
487
488
489
490
491
492
493
494
495
496
497
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
        patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=80)],
498
    ),
499
500
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
501
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
502
        prompt_formatter=identity,
503
504
        max_model_len=4096,
        max_num_seqs=2,
505
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
506
    ),
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
    "ovis1_6-gemma2": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "ovis1_6": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
532
533
534
535
536
537
538
539
540
541
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
542
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
543
    ),
544
545
546
547
548
549
550
551
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        max_model_len=4096,
        max_num_seqs=2,
        task="generate",
552
553
        # use sdpa mode for hf runner since phi3v didn't work with flash_attn
        hf_model_kwargs={"_attn_implementation": "sdpa"},
554
555
556
557
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
        num_logprobs=10,
    ),
558
    "pixtral_hf": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
559
        models=[os.path.join(models_path_prefix, "nm-testing/pixtral-12b-FP8-dynamic")],
560
561
562
563
564
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
565
        auto_cls=AutoModelForImageTextToText,
566
        marks=[large_gpu_mark(min_gb=48)],
567
    ),
568
    "qwen_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
569
        models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
570
571
572
573
574
575
576
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
577
578
        # FIXME: https://github.com/huggingface/transformers/issues/38358
        marks=[pytest.mark.skip("Model initialization fails")],
579
    ),
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.cpu_model],
    ),
597
598
599
600
601
602
603
604
605
606
607
608
609
610
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
611
612
613
614
615
616
617
618
619
620
    "smolvlm": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
    ),
621
    ### Tensor parallel / multi-gpu broadcast tests
622
    "chameleon-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
623
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
624
625
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
626
        auto_cls=AutoModelForImageTextToText,
627
628
629
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
630
        marks=multi_gpu_marks(num_gpus=2),
631
632
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
633
    "llava-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
634
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
635
636
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
637
        auto_cls=AutoModelForImageTextToText,
638
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
639
        marks=multi_gpu_marks(num_gpus=2),
640
641
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
642
    "llava_next-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
643
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
644
645
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
646
        auto_cls=AutoModelForImageTextToText,
647
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
648
        marks=multi_gpu_marks(num_gpus=2),
649
650
651
652
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
653
        models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")],
654
655
656
657
658
659
660
661
662
663
664
665
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
            ) for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
666
    "llava_onevision-multiple-images": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
667
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
668
669
670
671
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
672
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
673
674
675
676
677
678
679
680
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
681
682
683
684
685
686
687
688
689
690
691
692
693
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
            limit_mm_per_prompt={"image": 1},
        )],
    ),
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        tensor_parallel_size=8,
        vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
        marks=[large_gpu_mark(min_gb=80), multi_gpu_marks(num_gpus=8)],
    ),
710
711
712
713
}
# yapf: enable


714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    """Return ``test_settings`` with a ``split`` mark added to every entry.

    The model names referenced by the settings are sorted and cut into
    ``num_groups`` contiguous chunks of ``ceil(n_models / num_groups)`` names.
    Each test info that lists a model from chunk ``i`` is copied with
    ``pytest.mark.split(group=i)`` appended to its marks.

    If a test info lists several models that land in different chunks, the
    entry written last (the highest group index) is the one that is kept.

    Args:
        test_settings: Mapping from test-setting name to its test info.
        num_groups: Number of groups to divide the models into.

    Returns:
        A new mapping with the same keys whose values carry the extra mark.

    Raises:
        AssertionError: If any key of ``test_settings`` did not end up in
            the result.
    """
    # Reverse lookup from a test-info object (by identity) to its key.
    settings_key_of = {
        id(test_info): key
        for key, test_info in test_settings.items()
    }

    infos_per_model = defaultdict[str, list[VLMTestInfo]](list)
    for test_info in test_settings.values():
        for model_name in test_info.models:
            infos_per_model[model_name].append(test_info)

    ordered_models = sorted(infos_per_model)
    chunk_len = math.ceil(len(ordered_models) / num_groups)

    marked_settings = dict[str, VLMTestInfo]()
    for group_idx in range(num_groups):
        chunk_start = group_idx * chunk_len
        chunk = ordered_models[chunk_start:chunk_start + chunk_len]

        for model_name in chunk:
            for test_info in infos_per_model[model_name]:
                base_marks = test_info.marks or []
                marked_info = test_info._replace(
                    marks=base_marks + [pytest.mark.split(group=group_idx)])
                marked_settings[settings_key_of[id(test_info)]] = marked_info

    # Every original setting must have been assigned to some group.
    missing_keys = test_settings.keys() - marked_settings.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"

    return marked_settings


# Rebind the settings so that every entry carries a `split(group=...)` mark,
# with the models divided into two groups by sorted model name.
VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)


749
750
751
752
753
754
### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
755
# - audio
756
# - custom inputs
757
758
759
760
761
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
                             image_assets: ImageTestAssets, monkeypatch):
    """Run the single-image test for one parametrized (model, case) pair.

    Covers the ``VLMTestType.IMAGE`` cases selected with
    ``create_new_process_for_each_test=False`` and delegates to
    ``runners.run_single_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


782
783
784
785
786
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            image_assets: ImageTestAssets, monkeypatch):
    """Run the multi-image test for one parametrized (model, case) pair.

    Covers the ``VLMTestType.MULTI_IMAGE`` cases selected with
    ``create_new_process_for_each_test=False`` and delegates to
    ``runners.run_multi_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


807
808
809
810
811
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ))
def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: type[HfRunner],
                                vllm_runner: type[VllmRunner],
                                image_assets: ImageTestAssets, monkeypatch):
    """Run the image-embedding test for one parametrized (model, case) pair.

    Covers the ``VLMTestType.EMBEDDING`` cases selected with
    ``create_new_process_for_each_test=False`` and delegates to
    ``runners.run_embedding_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


831
832
833
834
835
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      video_assets: VideoTestAssets, monkeypatch):
    """Run the video test for one parametrized (model, case) pair.

    Covers the ``VLMTestType.VIDEO`` cases selected with
    ``create_new_process_for_each_test=False`` and delegates to
    ``runners.run_video_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ),
)
def test_audio_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
    monkeypatch,
):
    """Run the audio test for one parametrized (model, case) pair.

    Covers the ``VLMTestType.AUDIO`` cases selected with
    ``create_new_process_for_each_test=False`` and delegates to
    ``runners.run_audio_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    needs_v0_engine = model_type in REQUIRES_V0_MODELS
    if needs_v0_engine:
        monkeypatch.setenv("VLLM_USE_V1", "0")

    runners.run_audio_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


875
876
877
878
879
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
    ))
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    """Run the custom-inputs test for one parametrized (model, case) pair.

    Covers the ``VLMTestType.CUSTOM_INPUTS`` cases selected with
    ``create_new_process_for_each_test=False`` and delegates to
    ``runners.run_custom_inputs_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Test wrappers for the cases that run each test in a new process
901
902
903
904
905
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
                                   hf_runner: type[HfRunner],
                                   vllm_runner: type[VllmRunner],
                                   image_assets: ImageTestAssets, monkeypatch):
    """Run the single-image test for one (model, case) pair in a new process.

    Covers the ``VLMTestType.IMAGE`` cases selected with
    ``create_new_process_for_each_test=True`` and delegates to
    ``runners.run_single_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


927
928
929
930
931
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
                                  hf_runner: type[HfRunner],
                                  vllm_runner: type[VllmRunner],
                                  image_assets: ImageTestAssets, monkeypatch):
    """Run the multi-image test for one (model, case) pair in a new process.

    Covers the ``VLMTestType.MULTI_IMAGE`` cases selected with
    ``create_new_process_for_each_test=True`` and delegates to
    ``runners.run_multi_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


953
954
955
956
957
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
                                      hf_runner: type[HfRunner],
                                      vllm_runner: type[VllmRunner],
                                      image_assets: ImageTestAssets,
                                      monkeypatch):
    """Run the image-embedding test for one (model, case) pair in a new process.

    Covers the ``VLMTestType.EMBEDDING`` cases selected with
    ``create_new_process_for_each_test=True`` and delegates to
    ``runners.run_embedding_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


979
980
981
982
983
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ))
# Applied here as on the image, multi-image, embedding and custom-input
# heavy wrappers, so the selected cases actually run in a new process.
@create_new_process_for_each_test()
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            video_assets: VideoTestAssets, monkeypatch):
    """Run the video test for one (model, case) pair in a new process.

    Covers the ``VLMTestType.VIDEO`` cases selected with
    ``create_new_process_for_each_test=True`` and delegates to
    ``runners.run_video_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ))
# Applied here as on the image, multi-image, embedding and custom-input
# heavy wrappers, so the selected cases actually run in a new process.
@create_new_process_for_each_test()
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            audio_assets: AudioTestAssets, monkeypatch):
    """Run the audio test for one (model, case) pair in a new process.

    Covers the ``VLMTestType.AUDIO`` cases selected with
    ``create_new_process_for_each_test=True`` and delegates to
    ``runners.run_audio_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


1025
1026
1027
1028
1029
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    """Run the custom-inputs test for one (model, case) pair in a new process.

    Covers the ``VLMTestType.CUSTOM_INPUTS`` cases selected with
    ``create_new_process_for_each_test=True`` and delegates to
    ``runners.run_custom_inputs_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch undoes the environment change after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )