test_common.py 51.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
6
import math
7
import os
8
from collections import defaultdict
9
10
from pathlib import PosixPath

zhuwenwen's avatar
zhuwenwen committed
11
import os
12
import pytest
13
from transformers import (AutoModel, AutoModelForImageTextToText,
14
                          AutoModelForTextToWaveform, AutoModelForVision2Seq)
15
16

from vllm.platforms import current_platform
17
from vllm.utils import identity
18

19
20
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
                          ImageTestAssets, VideoTestAssets, VllmRunner)
21
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
22
                       multi_gpu_marks)
23
24
25
26
27
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
                              VLMTestInfo, VLMTestType)
zhuwenwen's avatar
zhuwenwen committed
28
from ....utils import models_path_prefix
29
30
31
32
33
34
35
36

# Workaround required by the phi3v & paligemma models:
# on ROCm, Triton flash attention can hit shared memory issues with these
# models, so turn it off and let other backends be used in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ.update({"VLLM_USE_TRITON_FLASH_ATTN": "0"})

37
38
39
# Test-setting keys that (per the constant's name) must run on V0; the comment
# above each entry records the problem seen when testing it on V1.
REQUIRES_V0_MODELS = [
    # V1 Test: not enough KV cache space in C1 (presumably "CI" - TODO confirm).
    "fuyu",
    # V1 Test: Deadlock issue when processing mm_inputs
    "llava-onevision-transformers",
]

44
45
46
47
48
49
# yapf: disable
# Settings shared by the broadcast (tensor-parallel) test cases.
# NOTE(review): presumably unpacked as keyword arguments into VLMTestInfo
# entries further down the file - confirm against the usages.
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(0.25, 0.5, 1.0)],
    # Both executor backends are listed so each one gets covered.
    "distributed_executor_backend": (
        "ray",
        "mp",
    ),
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
#     use the k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#                 - pytest $TEST_FILE -k "[llava-"
#     prevents matching on "[llava_next-" & will match just the enabled cases
#     for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the k flag to substring match the model name, e.g.,
#                 - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#     prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#        pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#        pytest $TEST_FILE -k  "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.

VLM_TEST_SETTINGS = {
92
93
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
94
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
95
96
97
98
99
100
101
102
        test_type=(
            VLMTestType.EMBEDDING,
            VLMTestType.IMAGE,
            VLMTestType.CUSTOM_INPUTS
        ),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
103
        auto_cls=AutoModelForImageTextToText,
104
105
106
107
108
109
110
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
111
112
        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
        dtype="bfloat16" if current_platform.is_cpu() else "auto",
113
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
114
115
    ),
    "paligemma": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
116
        models=[os.path.join(models_path_prefix, "google/paligemma-3b-mix-224")],
117
118
119
120
121
122
123
124
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt = lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
125
        auto_cls=AutoModelForImageTextToText,
126
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
127
128
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
129
    ),
zhuwenwen's avatar
zhuwenwen committed
130

Roger Wang's avatar
Roger Wang committed
131
    "qwen2_5_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
132
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
133
134
135
136
137
138
139
140
141
142
143
144
145
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
146
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
147
    ),
148
    "qwen2_5_omni": VLMTestInfo(
149
        models=["Qwen/Qwen2.5-Omni-3B"],
150
151
152
153
154
155
156
157
158
159
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
160
        num_logprobs= 6 if current_platform.is_cpu() else 5,
161
        auto_cls=AutoModelForTextToWaveform,
162
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
163
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
164
165
166
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
167
168
169
170
171
172
173
174
175
176
177
    "ultravox": VLMTestInfo(
        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
        test_type=VLMTestType.AUDIO,
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
    # Dynamic image length and number of patches
    "llava-onevision-transformers": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
    "idefics3-transformers": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.2, 0.15)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[large_gpu_mark(min_gb=32)],
    ),
226
    #### Extended model tests
227
    "aria": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
228
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
229
230
231
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
Roger Wang's avatar
Roger Wang committed
232
233
        max_model_len=4096,
        max_num_seqs=2,
234
235
236
237
238
239
240
241
242
243
244
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<vlm_image>Please describe the image shortly.",
            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
        }),
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
Jennifer Zhao's avatar
Jennifer Zhao committed
245
    "aya_vision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
246
        models=[os.path.join(models_path_prefix, "CohereForAI/aya-vision-8b")],
247
        test_type=(VLMTestType.IMAGE),
Jennifer Zhao's avatar
Jennifer Zhao committed
248
249
250
251
252
253
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
Jennifer Zhao's avatar
Jennifer Zhao committed
269
270
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
271
272
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
Roger Wang's avatar
Roger Wang committed
273
    ),
274
    "blip2": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
275
        models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")],
276
277
278
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
279
        auto_cls=AutoModelForImageTextToText,
280
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
281
282
        # FIXME: https://github.com/huggingface/transformers/pull/38510
        marks=[pytest.mark.skip("Model is broken")],
283
284
    ),
    "chameleon": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
285
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
286
287
288
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
289
        max_num_seqs=2,
290
        auto_cls=AutoModelForImageTextToText,
291
292
293
294
295
296
297
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
298
    "deepseek_vl_v2": VLMTestInfo(
299
        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
300
301
302
303
304
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        single_image_prompts=IMAGE_ASSETS.prompts({
305
306
            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
307
        }),
308
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
309
310
311
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
312
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
313
    ),
314
    "fuyu": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
315
        models=[os.path.join(models_path_prefix, "adept/fuyu-8b")],
316
317
318
319
320
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
321
        auto_cls=AutoModelForImageTextToText,
322
323
324
325
326
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
327
    "gemma3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
328
        models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
329
330
331
332
333
334
335
336
337
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
338
        auto_cls=AutoModelForImageTextToText,
339
340
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
341
        num_logprobs=10,
342
    ),
343
    "glm4v": VLMTestInfo(
344
        models=[os.path.join(models_path_prefix, "zai-org/glm-4v-9b")],
345
        test_type=VLMTestType.IMAGE,
346
347
348
349
350
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
        }),
351
352
353
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
354
355
356
357
358
359
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
360
        marks=[large_gpu_mark(min_gb=32)],
361
    ),
362
    "glm4_1v": VLMTestInfo(
363
        models=["zai-org/GLM-4.1V-9B-Thinking"],
364
365
366
367
368
369
370
371
372
373
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
374
        marks=[large_gpu_mark(min_gb=32)],
375
376
    ),
    "glm4_1v-video": VLMTestInfo(
377
        models=["zai-org/GLM-4.1V-9B-Thinking"],
378
379
380
381
382
383
384
385
386
387
        # GLM4.1V requires video metadata to be included in the input
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.video_with_metadata_glm4_1v(),
            limit_mm_per_prompt={"video": 1},
        )],
388
        marks=[large_gpu_mark(min_gb=32)],
389
    ),
390
391
    "h2ovl": VLMTestInfo(
        models = [
zhuwenwen's avatar
zhuwenwen committed
392
            os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
zhuwenwen's avatar
zhuwenwen committed
393
            os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b"),
394
395
396
397
398
399
400
401
402
403
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
404
        num_logprobs=10,
405
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
406
    ),
407
    "idefics3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
408
        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
409
410
411
412
413
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
414
        auto_cls=AutoModelForImageTextToText,
415
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
416
    ),
417
418
    "intern_vl": VLMTestInfo(
        models=[
zhuwenwen's avatar
zhuwenwen committed
419
420
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"),
421
422
            # FIXME: Config cannot be loaded in transformers 4.52
            # "OpenGVLab/Mono-InternVL-2B",
423
424
425
426
427
428
429
430
431
432
433
434
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
435
436
437
438
439
440
441
442
443
444
445
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
446
447
448
449
450
451
452
453
454
455
456
457
458
459
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        auto_cls=AutoModelForImageTextToText,
    ),
460
461
462
463
464
465
466
467
468
469
470
471
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        dtype="bfloat16",
        tensor_parallel_size=1,
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
472
473
474
475
476
477
478
479
480
481
482
483
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
484
485
        tensor_parallel_size=4,
        marks=multi_gpu_marks(num_gpus=4),
486
    ),
487
    "llava_next": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
488
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
489
490
491
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
492
        auto_cls=AutoModelForImageTextToText,
493
494
495
496
497
498
499
500
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
501
    "llava_onevision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
502
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
503
504
505
506
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
507
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
508
509
510
511
512
513
514
515
516
517
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
            ),
            limit_mm_per_prompt={"video": 4},
        )],
    ),
    "llava_next_video": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
518
        models=[os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf")],
519
520
521
522
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
523
        max_num_seqs=2,
524
525
526
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
527
    "mantis": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
528
        models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")],
529
530
531
532
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
533
        auto_cls=AutoModelForImageTextToText,
534
535
536
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
537
    "minicpmv_25": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
538
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")],
539
        test_type=VLMTestType.IMAGE,
540
541
542
543
544
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
545
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
546
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
547
548
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
549
    ),
550
551
552
553
554
555
556
557
558
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
559
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
560
        # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
561
        marks=[pytest.mark.skip("HF import fails")],
562
    ),
563
    "minicpmv_26": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
564
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
565
566
567
568
569
570
571
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
572
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
573
    ),
574
575
576
577
578
579
580
581
582
583
584
585
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
        patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=80)],
586
    ),
587
588
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
589
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
590
        prompt_formatter=identity,
591
592
        max_model_len=4096,
        max_num_seqs=2,
593
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
594
    ),
595
596
597
598
599
600
601
602
603
604
605
606
607
    "ovis1_6-gemma2": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=32)],
    ),
608
609
610
611
612
613
614
615
616
617
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
618
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
619
    ),
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
    "ovis2_5": VLMTestInfo(
        models=["AIDC-AI/Ovis2.5-2B"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        video_idx_to_prompt=lambda idx: "<video>\n",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        num_logprobs=10,
        patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
635
        hf_model_kwargs={"revision": "refs/pr/5"},
636
    ),
637
638
639
640
641
642
643
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        max_model_len=4096,
        max_num_seqs=2,
644
        runner="generate",
645
646
        # use sdpa mode for hf runner since phi3v didn't work with flash_attn
        hf_model_kwargs={"_attn_implementation": "sdpa"},
647
648
649
650
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
        num_logprobs=10,
    ),
651
    "pixtral_hf": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
652
        models=[os.path.join(models_path_prefix, "nm-testing/pixtral-12b-FP8-dynamic")],
653
654
655
656
657
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
658
        auto_cls=AutoModelForImageTextToText,
659
        marks=[large_gpu_mark(min_gb=48)],
660
    ),
661
    "qwen_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
662
        models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
663
664
665
666
667
668
669
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
670
671
        # FIXME: https://github.com/huggingface/transformers/issues/38358
        marks=[pytest.mark.skip("Model initialization fails")],
672
    ),
673
674
675
676
677
678
679
680
681
682
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
683
        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",    # noqa: E501
684
685
686
687
688
689
690
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.cpu_model],
    ),
691
692
693
694
695
696
697
698
699
700
701
702
703
704
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
705
706
707
    "smolvlm": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
708
        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
709
710
711
712
713
714
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
    ),
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
    "tarsier": VLMTestInfo(
        models=["omni-research/Tarsier-7b"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
    ),
    "tarsier2": VLMTestInfo(
        models=["omni-research/Tarsier2-Recap-7b"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.skip("Model initialization hangs")],
    ),
740
    ### Tensor parallel / multi-gpu broadcast tests
741
    "chameleon-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
742
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
743
744
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
745
        auto_cls=AutoModelForImageTextToText,
746
747
748
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
749
        marks=multi_gpu_marks(num_gpus=2),
750
751
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
752
    "llava-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
753
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
754
755
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
756
        auto_cls=AutoModelForImageTextToText,
757
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
758
        marks=multi_gpu_marks(num_gpus=2),
759
760
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
761
    "llava_next-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
762
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
763
764
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
765
        auto_cls=AutoModelForImageTextToText,
766
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
767
        marks=multi_gpu_marks(num_gpus=2),
768
769
770
771
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
772
        models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")],
773
774
775
776
777
778
779
780
781
782
783
784
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
            ) for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
785
    "llava_onevision-multiple-images": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
786
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
787
788
789
790
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
791
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
792
793
794
795
796
797
798
799
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
800
801
802
803
804
805
806
807
808
809
810
811
812
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
            limit_mm_per_prompt={"image": 1},
        )],
    ),
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        tensor_parallel_size=8,
        vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
        marks=[large_gpu_mark(min_gb=80), multi_gpu_marks(num_gpus=8)],
    ),
829
830
831
832
}
# yapf: enable


833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    """Attach a ``pytest.mark.split(group=...)`` mark to every test setting.

    The distinct model names referenced by ``test_settings`` are sorted and
    cut into ``num_groups`` consecutive slices of equal (ceil) size; every
    setting that references a model receives the mark of that model's slice.

    Args:
        test_settings: Mapping of setting name to its ``VLMTestInfo``.
        num_groups: Number of slices the sorted model names are cut into.

    Returns:
        A new mapping holding copies of the settings with the extra mark
        appended to their existing ``marks``. The input is not modified.

    Raises:
        AssertionError: If any key of ``test_settings`` is absent from the
            result (for example a setting whose ``models`` is empty).
    """
    # Reverse lookup from object identity to the key the info is stored at.
    key_for_info = {id(info): key for key, info in test_settings.items()}

    infos_for_model: defaultdict[str, list[VLMTestInfo]] = defaultdict(list)
    for info in test_settings.values():
        for model_name in info.models:
            infos_for_model[model_name].append(info)

    ordered_models = sorted(infos_for_model)
    chunk = math.ceil(len(ordered_models) / num_groups)

    marked: dict[str, VLMTestInfo] = {}
    for idx in range(num_groups):
        start = idx * chunk
        for model_name in ordered_models[start:start + chunk]:
            for info in infos_for_model[model_name]:
                # Always derive from the original info, so a setting whose
                # models fall into several groups keeps only the mark of the
                # last group that was processed.
                marks = (info.marks or []) + [pytest.mark.split(group=idx)]
                marked[key_for_info[id(info)]] = info._replace(marks=marks)

    missing_keys = test_settings.keys() - marked.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"

    return marked


VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)


868
869
870
871
872
873
### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - audio
# - custom inputs
876
877
878
879
880
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
                             image_assets: ImageTestAssets, monkeypatch):
    """Run the single-image test for one ``(model_type, test_case)`` pair.

    Parametrized over the ``VLMTestType.IMAGE`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=False``. The work itself is delegated
    to ``runners.run_single_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


901
902
903
904
905
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            image_assets: ImageTestAssets, monkeypatch):
    """Run the multi-image test for one ``(model_type, test_case)`` pair.

    Parametrized over the ``VLMTestType.MULTI_IMAGE`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=False``. The work itself is delegated
    to ``runners.run_multi_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


926
927
928
929
930
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ))
def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: type[HfRunner],
                                vllm_runner: type[VllmRunner],
                                image_assets: ImageTestAssets, monkeypatch):
    """Run the image-embedding test for one ``(model_type, test_case)`` pair.

    Parametrized over the ``VLMTestType.EMBEDDING`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=False``. The work itself is delegated
    to ``runners.run_embedding_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


950
951
952
953
954
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      video_assets: VideoTestAssets, monkeypatch):
    """Run the video test for one ``(model_type, test_case)`` pair.

    Parametrized over the ``VLMTestType.VIDEO`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=False``. The work itself is delegated
    to ``runners.run_video_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ),
)
def test_audio_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
    monkeypatch,
):
    """Run the audio test for one ``(model_type, test_case)`` pair.

    Covers the ``VLMTestType.AUDIO`` cases of ``VLM_TEST_SETTINGS`` that were
    selected with ``create_new_process_for_each_test=False`` and hands them
    to ``runners.run_audio_test``.
    """
    needs_v0 = model_type in REQUIRES_V0_MODELS
    if needs_v0:
        # Set via the monkeypatch fixture rather than os.environ directly.
        monkeypatch.setenv("VLLM_USE_V1", "0")

    runners.run_audio_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


994
995
996
997
998
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
    ))
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    """Run the custom-inputs test for one ``(model_type, test_case)`` pair.

    Parametrized over the ``VLMTestType.CUSTOM_INPUTS`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=False``. The work itself is delegated
    to ``runners.run_custom_inputs_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Same wrappers, filtered to the test cases that request a new process per test
1020
1021
1022
1023
1024
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
                                   hf_runner: type[HfRunner],
                                   vllm_runner: type[VllmRunner],
                                   image_assets: ImageTestAssets, monkeypatch):
    """Run the single-image test for cases that request a new process.

    Parametrized over the ``VLMTestType.IMAGE`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test()`` decorator. The work itself is
    delegated to ``runners.run_single_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1046
1047
1048
1049
1050
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
                                  hf_runner: type[HfRunner],
                                  vllm_runner: type[VllmRunner],
                                  image_assets: ImageTestAssets, monkeypatch):
    """Run the multi-image test for cases that request a new process.

    Parametrized over the ``VLMTestType.MULTI_IMAGE`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test()`` decorator. The work itself is
    delegated to ``runners.run_multi_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1072
1073
1074
1075
1076
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
                                      hf_runner: type[HfRunner],
                                      vllm_runner: type[VllmRunner],
                                      image_assets: ImageTestAssets,
                                      monkeypatch):
    """Run the image-embedding test for cases that request a new process.

    Parametrized over the ``VLMTestType.EMBEDDING`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test()`` decorator. The work itself is
    delegated to ``runners.run_embedding_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1098
1099
1100
1101
1102
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ))
# The cases selected above explicitly request a new process per test, so the
# wrapper is applied here exactly as on the image / embedding / custom-input
# ``*_heavy`` tests in this file.
@create_new_process_for_each_test()
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            video_assets: VideoTestAssets, monkeypatch):
    """Run the video test for cases that request a new process.

    Parametrized over the ``VLMTestType.VIDEO`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test()`` decorator. The work itself is
    delegated to ``runners.run_video_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ))
# The cases selected above explicitly request a new process per test, so the
# wrapper is applied here exactly as on the image / embedding / custom-input
# ``*_heavy`` tests in this file.
@create_new_process_for_each_test()
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            audio_assets: AudioTestAssets, monkeypatch):
    """Run the audio test for cases that request a new process.

    Parametrized over the ``VLMTestType.AUDIO`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test()`` decorator. The work itself is
    delegated to ``runners.run_audio_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


1144
1145
1146
1147
1148
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    """Run the custom-inputs test for cases that request a new process.

    Parametrized over the ``VLMTestType.CUSTOM_INPUTS`` cases of
    ``VLM_TEST_SETTINGS`` selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test()`` decorator. The work itself is
    delegated to ``runners.run_custom_inputs_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0; the
    # variable is set through the monkeypatch fixture.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )