"vscode:/vscode.git/clone" did not exist on "43710e8d0965d616db51639ced76605dee1bde93"
test_common.py 45.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
6
import math
7
import os
8
from collections import defaultdict
9
10
11
from pathlib import PosixPath

import pytest
12
from transformers import (AutoModel, AutoModelForImageTextToText,
13
                          AutoModelForTextToWaveform, AutoModelForVision2Seq)
14
15

from vllm.platforms import current_platform
16
from vllm.utils import identity
17

18
19
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
                          ImageTestAssets, VideoTestAssets, VllmRunner)
20
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
21
                       multi_gpu_marks)
22
23
24
25
26
27
28
29
30
31
32
33
34
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
                              VLMTestInfo, VLMTestType)

# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

35
36
37
38
39
REQUIRES_V0_MODELS = [
    # V1 Test: not enough KV cache space in C1.
    "fuyu",
]

40
41
42
43
44
45
# yapf: disable
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
46
    "hf_model_kwargs": {"device_map": "auto"},
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
    "image_size_factors": [(.25, 0.5, 1.0)],
    "distributed_executor_backend": (
        "ray",
        "mp",
    )
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
#     use the k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#                 - pytest $TEST_FILE -k "[llava-"
#     prevents matching on "[llava_next-" & will match just the enabled cases
#     for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the k flag to substring match the model name, e.g.,
#                 - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#     prevents matching on nGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#        pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#        pytest $TEST_FILE -k  "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.

VLM_TEST_SETTINGS = {
88
89
90
91
92
93
94
95
96
97
98
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        test_type=(
            VLMTestType.EMBEDDING,
            VLMTestType.IMAGE,
            VLMTestType.CUSTOM_INPUTS
        ),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
99
        auto_cls=AutoModelForImageTextToText,
100
101
102
103
104
105
106
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
107
108
        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
        dtype="bfloat16" if current_platform.is_cpu() else "auto",
109
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
110
111
112
113
114
115
116
117
118
119
120
    ),
    "paligemma": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt = lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
121
        auto_cls=AutoModelForImageTextToText,
122
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
123
124
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
125
    ),
Roger Wang's avatar
Roger Wang committed
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
    "qwen2_5_vl": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
141
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
Roger Wang's avatar
Roger Wang committed
142
    ),
143
    "qwen2_5_omni": VLMTestInfo(
144
        models=["Qwen/Qwen2.5-Omni-3B"],
145
146
147
148
149
150
151
152
153
154
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
155
        num_logprobs= 6 if current_platform.is_cpu() else 5,
156
        auto_cls=AutoModelForTextToWaveform,
157
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
158
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
159
160
161
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
162
163
164
165
166
167
168
169
170
171
172
    "ultravox": VLMTestInfo(
        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
        test_type=VLMTestType.AUDIO,
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
173
    #### Extended model tests
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
    "aria": VLMTestInfo(
        models=["rhymes-ai/Aria"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<vlm_image>Please describe the image shortly.",
            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
        }),
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
Jennifer Zhao's avatar
Jennifer Zhao committed
192
193
    "aya_vision": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
194
        test_type=(VLMTestType.IMAGE),
Jennifer Zhao's avatar
Jennifer Zhao committed
195
196
197
198
199
200
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
Jennifer Zhao's avatar
Jennifer Zhao committed
216
217
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
218
219
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
Jennifer Zhao's avatar
Jennifer Zhao committed
220
    ),
221
    "blip2": VLMTestInfo(
222
        models=["Salesforce/blip2-opt-2.7b"],
223
224
225
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
226
        auto_cls=AutoModelForImageTextToText,
227
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
228
229
        # FIXME: https://github.com/huggingface/transformers/pull/38510
        marks=[pytest.mark.skip("Model is broken")],
230
231
232
233
234
235
    ),
    "chameleon": VLMTestInfo(
        models=["facebook/chameleon-7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
236
        max_num_seqs=2,
237
        auto_cls=AutoModelForImageTextToText,
238
239
240
241
242
243
244
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
245
    "deepseek_vl_v2": VLMTestInfo(
246
        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
247
248
249
250
251
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        single_image_prompts=IMAGE_ASSETS.prompts({
252
253
            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
254
        }),
255
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
256
257
258
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
259
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
260
    ),
261
262
263
264
265
266
267
    "fuyu": VLMTestInfo(
        models=["adept/fuyu-8b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
268
        auto_cls=AutoModelForImageTextToText,
269
270
271
272
273
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
274
275
276
277
278
279
280
281
282
283
284
    "gemma3": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
285
        auto_cls=AutoModelForImageTextToText,
286
287
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
288
        num_logprobs=10,
289
    ),
290
    "glm4v": VLMTestInfo(
291
292
        models=["THUDM/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
293
294
295
296
297
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
        }),
298
299
300
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
301
302
303
304
305
306
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
307
        marks=[large_gpu_mark(min_gb=32)],
308
    ),
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
    "glm4_1v": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
    ),
    "glm4_1v-video": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
        # GLM4.1V require include video metadata for input
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.video_with_metadata_glm4_1v(),
            limit_mm_per_prompt={"video": 1},
        )],
        # This is needed to run on machine with 24GB VRAM
        vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
    ),
337
338
339
    "h2ovl": VLMTestInfo(
        models = [
            "h2oai/h2ovl-mississippi-800m",
340
            "h2oai/h2ovl-mississippi-2b",
341
342
343
344
345
346
347
348
349
350
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
351
        num_logprobs=10,
352
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
353
    ),
354
    "idefics3": VLMTestInfo(
355
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
356
357
358
359
360
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
361
        auto_cls=AutoModelForImageTextToText,
362
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
363
    ),
364
365
366
367
    "intern_vl": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL2-1B",
            "OpenGVLab/InternVL2-2B",
368
369
            # FIXME: Config cannot be loaded in transformers 4.52
            # "OpenGVLab/Mono-InternVL-2B",
370
371
372
373
374
375
376
377
378
379
380
381
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
382
383
384
385
386
387
388
389
390
391
392
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
393
394
395
396
397
398
399
400
401
402
403
404
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        dtype="bfloat16",
        tensor_parallel_size=1,
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
405
406
407
408
409
410
411
412
413
414
415
416
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
417
418
        tensor_parallel_size=4,
        marks=multi_gpu_marks(num_gpus=4),
419
    ),
420
421
422
423
424
    "llava_next": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
425
        auto_cls=AutoModelForImageTextToText,
426
427
428
429
430
431
432
433
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
434
    "llava_onevision": VLMTestInfo(
435
436
437
438
439
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
440
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
            ),
            limit_mm_per_prompt={"video": 4},
        )],
    ),
    "llava_next_video": VLMTestInfo(
        models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
456
        max_num_seqs=2,
457
458
459
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
460
461
462
463
464
465
    "mantis": VLMTestInfo(
        models=["TIGER-Lab/Mantis-8B-siglip-llama3"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
466
        auto_cls=AutoModelForImageTextToText,
467
468
469
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
470
    "minicpmv_25": VLMTestInfo(
471
        models=["openbmb/MiniCPM-Llama3-V-2_5"],
472
        test_type=VLMTestType.IMAGE,
473
474
475
476
477
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
478
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
479
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
480
481
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
482
    ),
483
484
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
485
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
486
487
488
489
490
491
492
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
493
494
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
495
    ),
496
497
    "minicpmv_26": VLMTestInfo(
        models=["openbmb/MiniCPM-V-2_6"],
498
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
499
500
501
502
503
504
505
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
506
507
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
508
    ),
509
510
511
512
513
514
515
516
517
518
519
520
521
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
        patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=80)],
    ),
522
523
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
524
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
525
        prompt_formatter=identity,
526
527
        max_model_len=4096,
        max_num_seqs=2,
528
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
529
    ),
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
    "ovis1_6-gemma2": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "ovis1_6": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
555
556
557
558
559
560
561
562
563
564
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
565
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
566
    ),
567
568
569
570
571
572
573
574
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        max_model_len=4096,
        max_num_seqs=2,
        task="generate",
575
576
        # use sdpa mode for hf runner since phi3v didn't work with flash_attn
        hf_model_kwargs={"_attn_implementation": "sdpa"},
577
578
579
580
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
        num_logprobs=10,
    ),
581
582
583
584
585
586
587
    "pixtral_hf": VLMTestInfo(
        models=["nm-testing/pixtral-12b-FP8-dynamic"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
588
        auto_cls=AutoModelForImageTextToText,
589
        marks=[large_gpu_mark(min_gb=48)],
590
    ),
591
    "qwen_vl": VLMTestInfo(
592
593
594
595
596
597
598
599
        models=["Qwen/Qwen-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
600
601
        # FIXME: https://github.com/huggingface/transformers/issues/38358
        marks=[pytest.mark.skip("Model initialization fails")],
602
    ),
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.cpu_model],
    ),
620
621
622
623
624
625
626
627
628
629
630
631
632
633
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
634
635
636
637
638
639
640
641
642
643
    "smolvlm": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
    ),
644
    ### Tensor parallel / multi-gpu broadcast tests
645
    "chameleon-broadcast": VLMTestInfo(
646
647
648
        models=["facebook/chameleon-7b"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
649
        auto_cls=AutoModelForImageTextToText,
650
651
652
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
653
        marks=multi_gpu_marks(num_gpus=2),
654
655
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
656
    "llava-broadcast": VLMTestInfo(
657
658
659
        models=["llava-hf/llava-1.5-7b-hf"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
660
        auto_cls=AutoModelForImageTextToText,
661
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
662
        marks=multi_gpu_marks(num_gpus=2),
663
664
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
665
    "llava_next-broadcast": VLMTestInfo(
666
667
668
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
669
        auto_cls=AutoModelForImageTextToText,
670
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
671
        marks=multi_gpu_marks(num_gpus=2),
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
        models=["OpenGVLab/InternVL2-2B"],
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
            ) for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
689
    "llava_onevision-multiple-images": VLMTestInfo(
690
691
692
693
694
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
695
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
696
697
698
699
700
701
702
703
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
704
705
706
707
708
709
710
711
712
713
714
715
716
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
            limit_mm_per_prompt={"image": 1},
        )],
    ),
717
718
719
720
}
# yapf: enable


721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    name_by_test_info_id = {id(v): k for k, v in test_settings.items()}
    test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list)

    for info in test_settings.values():
        for model in info.models:
            test_infos_by_model[model].append(info)

    models = sorted(test_infos_by_model.keys())
    split_size = math.ceil(len(models) / num_groups)

    new_test_settings = dict[str, VLMTestInfo]()

    for i in range(num_groups):
        models_in_group = models[i * split_size:(i + 1) * split_size]

        for model in models_in_group:
            for info in test_infos_by_model[model]:
                new_marks = (info.marks or []) + [pytest.mark.split(group=i)]
                new_info = info._replace(marks=new_marks)
                new_test_settings[name_by_test_info_id[id(info)]] = new_info

    missing_keys = test_settings.keys() - new_test_settings.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"

    return new_test_settings


VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)


756
757
758
759
760
761
### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
762
# - audio
763
# - custom inputs
764
765
766
767
768
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
769
        create_new_process_for_each_test=False,
770
    ))
771
772
def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
773
774
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
775
                             image_assets: ImageTestAssets, monkeypatch):
776
777
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
778
779
780
781
782
783
784
785
786
787
788
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


789
790
791
792
793
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
794
        create_new_process_for_each_test=False,
795
    ))
796
797
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
798
799
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
800
                            image_assets: ImageTestAssets, monkeypatch):
801
802
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
803
804
805
806
807
808
809
810
811
812
813
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


814
815
816
817
818
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
819
        create_new_process_for_each_test=False,
820
    ))
821
822
def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
823
824
                                hf_runner: type[HfRunner],
                                vllm_runner: type[VllmRunner],
825
                                image_assets: ImageTestAssets, monkeypatch):
826
827
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
828
829
830
831
832
833
834
835
836
837
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


838
839
840
841
842
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
843
        create_new_process_for_each_test=False,
844
    ))
845
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
846
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
847
                      video_assets: VideoTestAssets, monkeypatch):
848
849
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
850
851
852
853
854
855
856
857
858
859
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ))
def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      audio_assets: AudioTestAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


882
883
884
885
886
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
887
        create_new_process_for_each_test=False,
888
    ))
889
890
891
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
892
893
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
894
    monkeypatch,
895
):
896
897
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
898
899
900
901
902
903
904
905
906
907
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Tests filtering for things running each test as a new process
908
909
910
911
912
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
913
        create_new_process_for_each_test=True,
914
    ))
915
@create_new_process_for_each_test()
916
917
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
918
919
                                   hf_runner: type[HfRunner],
                                   vllm_runner: type[VllmRunner],
920
                                   image_assets: ImageTestAssets, monkeypatch):
921
922
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
923
924
925
926
927
928
929
930
931
932
933
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


934
935
936
937
938
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
939
        create_new_process_for_each_test=True,
940
    ))
941
@create_new_process_for_each_test()
942
943
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
944
945
                                  hf_runner: type[HfRunner],
                                  vllm_runner: type[VllmRunner],
946
                                  image_assets: ImageTestAssets, monkeypatch):
947
948
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
949
950
951
952
953
954
955
956
957
958
959
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


960
961
962
963
964
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
965
        create_new_process_for_each_test=True,
966
    ))
967
@create_new_process_for_each_test()
968
969
def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
970
971
                                      hf_runner: type[HfRunner],
                                      vllm_runner: type[VllmRunner],
972
973
                                      image_assets: ImageTestAssets,
                                      monkeypatch):
974
975
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
976
977
978
979
980
981
982
983
984
985
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


986
987
988
989
990
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
991
        create_new_process_for_each_test=True,
992
    ))
993
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
994
995
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
996
                            video_assets: VideoTestAssets, monkeypatch):
997
998
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ))
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            audio_assets: AudioTestAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


1032
1033
1034
1035
1036
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
1037
        create_new_process_for_each_test=True,
1038
    ))
1039
@create_new_process_for_each_test()
1040
1041
1042
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
1043
1044
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
1045
    monkeypatch,
1046
):
1047
1048
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
1049
1050
1051
1052
1053
1054
1055
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )