test_common.py 50.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
6
import math
7
import os
8
from collections import defaultdict
9
10
from pathlib import PosixPath

zhuwenwen's avatar
zhuwenwen committed
11
import os
12
import pytest
13
from transformers import (AutoModel, AutoModelForImageTextToText,
14
                          AutoModelForTextToWaveform, AutoModelForVision2Seq)
15
16

from vllm.platforms import current_platform
17
from vllm.utils import identity
18

19
20
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
                          ImageTestAssets, VideoTestAssets, VllmRunner)
21
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
22
                       multi_gpu_marks)
23
24
25
26
27
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
                              VLMTestInfo, VLMTestType)
zhuwenwen's avatar
zhuwenwen committed
28
from ....utils import models_path_prefix
29
30
31
32
33
34
35
36

# Workaround for the phi3v & paligemma models:
# on ROCm, the Triton flash-attention backend can hit shared memory issues
# with these models, so it is switched off here and another backend is used
# in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ.update(VLLM_USE_TRITON_FLASH_ATTN="0")

37
38
39
# Test-setting keys that are listed as not yet working under V1; each entry
# carries the reason it was added.
# NOTE(review): presumably consumed further down the file to force these
# models onto the V0 engine — confirm against the usage site.
REQUIRES_V0_MODELS = [
    # V1 Test: not enough KV cache space in C1.
    "fuyu",
    # V1 Test: Deadlock issue when processing mm_inputs
    "llava-onevision-transformers",
]

44
45
46
47
48
49
# yapf: disable
# Shared settings for the broadcast test cases.
# NOTE(review): presumably unpacked into VLMTestInfo entries later in the
# file — confirm against the usage sites.
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(.25, 0.5, 1.0)],
    # Both executor backends are listed so each one gets covered.
    "distributed_executor_backend": (
        "ray",
        "mp",
    )
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example:
#
# To run all test types for a specific key:
#     use the k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#                 - pytest $TEST_FILE -k "[llava-"
#     prevents matching on "[llava_next-" & will match just the enabled cases
#     for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the k flag to substring match the model name, e.g.,
#                 - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#     prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#        pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#        pytest $TEST_FILE -k  "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.

VLM_TEST_SETTINGS = {
92
93
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
94
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
95
96
97
98
99
100
101
102
        test_type=(
            VLMTestType.EMBEDDING,
            VLMTestType.IMAGE,
            VLMTestType.CUSTOM_INPUTS
        ),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
103
        auto_cls=AutoModelForImageTextToText,
104
105
106
107
108
109
110
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
111
112
        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
        dtype="bfloat16" if current_platform.is_cpu() else "auto",
113
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
114
115
    ),
    "paligemma": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
116
        models=[os.path.join(models_path_prefix, "google/paligemma-3b-mix-224")],
117
118
119
120
121
122
123
124
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt = lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
125
        auto_cls=AutoModelForImageTextToText,
126
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
127
128
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
129
    ),
zhuwenwen's avatar
zhuwenwen committed
130

Roger Wang's avatar
Roger Wang committed
131
    "qwen2_5_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
132
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
133
134
135
136
137
138
139
140
141
142
143
144
145
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
146
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
147
    ),
148
    "qwen2_5_omni": VLMTestInfo(
149
        models=["Qwen/Qwen2.5-Omni-3B"],
150
151
152
153
154
155
156
157
158
159
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
160
        num_logprobs= 6 if current_platform.is_cpu() else 5,
161
        auto_cls=AutoModelForTextToWaveform,
162
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
163
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
164
165
166
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
167
168
169
170
171
172
173
174
175
176
177
    "ultravox": VLMTestInfo(
        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
        test_type=VLMTestType.AUDIO,
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
    # Dynamic image length and number of patches
    "llava-onevision-transformers": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
    # FIXME(Isotr0py): Enable this test after
    # https://github.com/huggingface/transformers/pull/39470 released
    # "idefics3-transformers": VLMTestInfo(
    #     models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
    #     prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
    #     img_idx_to_prompt=lambda idx: "<image>",
    #     max_model_len=8192,
    #     max_num_seqs=2,
    #     auto_cls=AutoModelForImageTextToText,
    #     hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    #     image_size_factors=[(0.25, 0.5, 1.0)],
    #     vllm_runner_kwargs={
    #         "model_impl": "transformers",
    #     },
    #     marks=[pytest.mark.core_model],
    # ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.2, 0.15)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[large_gpu_mark(min_gb=32)],
    ),
    # Check "auto" with fallback to transformers
    "internvl-transformers": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
        max_model_len=4096,
        use_tokenizer_eos=True,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "auto",
        },
        auto_cls=AutoModelForImageTextToText,
        marks=[pytest.mark.core_model],
    ),
243
    #### Extended model tests
244
    "aria": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
245
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
246
247
248
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
Roger Wang's avatar
Roger Wang committed
249
250
        max_model_len=4096,
        max_num_seqs=2,
251
252
253
254
255
256
257
258
259
260
261
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<vlm_image>Please describe the image shortly.",
            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
        }),
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
Jennifer Zhao's avatar
Jennifer Zhao committed
262
    "aya_vision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
263
        models=[os.path.join(models_path_prefix, "CohereForAI/aya-vision-8b")],
264
        test_type=(VLMTestType.IMAGE),
Jennifer Zhao's avatar
Jennifer Zhao committed
265
266
267
268
269
270
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
Jennifer Zhao's avatar
Jennifer Zhao committed
286
287
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
288
289
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
Roger Wang's avatar
Roger Wang committed
290
    ),
291
    "blip2": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
292
        models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")],
293
294
295
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
296
        auto_cls=AutoModelForImageTextToText,
297
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
298
299
        # FIXME: https://github.com/huggingface/transformers/pull/38510
        marks=[pytest.mark.skip("Model is broken")],
300
301
    ),
    "chameleon": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
302
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
303
304
305
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
306
        max_num_seqs=2,
307
        auto_cls=AutoModelForImageTextToText,
308
309
310
311
312
313
314
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
315
    "deepseek_vl_v2": VLMTestInfo(
316
        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
317
318
319
320
321
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        single_image_prompts=IMAGE_ASSETS.prompts({
322
323
            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
324
        }),
325
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
326
327
328
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
329
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
330
    ),
331
    "fuyu": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
332
        models=[os.path.join(models_path_prefix, "adept/fuyu-8b")],
333
334
335
336
337
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
338
        auto_cls=AutoModelForImageTextToText,
339
340
341
342
343
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
344
    "gemma3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
345
        models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
346
347
348
349
350
351
352
353
354
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
355
        auto_cls=AutoModelForImageTextToText,
356
357
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
358
        num_logprobs=10,
359
    ),
360
    "glm4v": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
361
        models=[os.path.join(models_path_prefix, "THUDM/glm-4v-9b")],
362
        test_type=VLMTestType.IMAGE,
363
364
365
366
367
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
        }),
368
369
370
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
371
372
373
374
375
376
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
377
        marks=[large_gpu_mark(min_gb=32)],
378
    ),
379
380
381
382
383
384
385
386
387
388
389
390
    "glm4_1v": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
391
        marks=[large_gpu_mark(min_gb=32)],
392
393
394
395
396
397
398
399
400
401
402
403
404
    ),
    "glm4_1v-video": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
        # GLM4.1V requires video metadata to be included with the input
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.video_with_metadata_glm4_1v(),
            limit_mm_per_prompt={"video": 1},
        )],
405
        marks=[large_gpu_mark(min_gb=32)],
406
    ),
407
408
    "h2ovl": VLMTestInfo(
        models = [
zhuwenwen's avatar
zhuwenwen committed
409
            os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
zhuwenwen's avatar
zhuwenwen committed
410
            os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b"),
411
412
413
414
415
416
417
418
419
420
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
421
        num_logprobs=10,
422
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
423
    ),
424
    "idefics3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
425
        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
426
427
428
429
430
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
431
        auto_cls=AutoModelForImageTextToText,
432
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
433
    ),
434
435
    "intern_vl": VLMTestInfo(
        models=[
zhuwenwen's avatar
zhuwenwen committed
436
437
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"),
438
439
            # FIXME: Config cannot be loaded in transformers 4.52
            # "OpenGVLab/Mono-InternVL-2B",
440
441
442
443
444
445
446
447
448
449
450
451
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
452
453
454
455
456
457
458
459
460
461
462
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
463
464
465
466
467
468
469
470
471
472
473
474
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        dtype="bfloat16",
        tensor_parallel_size=1,
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
475
476
477
478
479
480
481
482
483
484
485
486
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
487
488
        tensor_parallel_size=4,
        marks=multi_gpu_marks(num_gpus=4),
489
    ),
490
    "llava_next": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
491
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
492
493
494
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
495
        auto_cls=AutoModelForImageTextToText,
496
497
498
499
500
501
502
503
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
504
    "llava_onevision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
505
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
506
507
508
509
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
510
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
511
512
513
514
515
516
517
518
519
520
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
            ),
            limit_mm_per_prompt={"video": 4},
        )],
    ),
    "llava_next_video": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
521
        models=[os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf")],
522
523
524
525
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
526
        max_num_seqs=2,
527
528
529
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
530
    "mantis": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
531
        models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")],
532
533
534
535
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
536
        auto_cls=AutoModelForImageTextToText,
537
538
539
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
540
    "minicpmv_25": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
541
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")],
542
        test_type=VLMTestType.IMAGE,
543
544
545
546
547
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
548
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
549
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
550
551
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
552
    ),
553
554
555
556
557
558
559
560
561
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
562
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
563
564
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
565
    ),
566
    "minicpmv_26": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
567
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
568
569
570
571
572
573
574
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
575
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
576
577
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
578
    ),
579
580
581
582
583
584
585
586
587
588
589
590
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
        patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=80)],
591
    ),
592
593
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
594
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
595
        prompt_formatter=identity,
596
597
        max_model_len=4096,
        max_num_seqs=2,
598
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
599
    ),
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
    "ovis1_6-gemma2": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "ovis1_6": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
625
626
627
628
629
630
631
632
633
634
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
635
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
636
    ),
637
638
639
640
641
642
643
644
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        max_model_len=4096,
        max_num_seqs=2,
        task="generate",
645
646
        # use sdpa mode for hf runner since phi3v didn't work with flash_attn
        hf_model_kwargs={"_attn_implementation": "sdpa"},
647
648
649
650
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
        num_logprobs=10,
    ),
651
    "pixtral_hf": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
652
        models=[os.path.join(models_path_prefix, "nm-testing/pixtral-12b-FP8-dynamic")],
653
654
655
656
657
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
658
        auto_cls=AutoModelForImageTextToText,
659
        marks=[large_gpu_mark(min_gb=48)],
660
    ),
661
    "qwen_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
662
        models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
663
664
665
666
667
668
669
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
670
671
        # FIXME: https://github.com/huggingface/transformers/issues/38358
        marks=[pytest.mark.skip("Model initialization fails")],
672
    ),
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.cpu_model],
    ),
690
691
692
693
694
695
696
697
698
699
700
701
702
703
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
704
705
706
707
708
709
710
711
712
713
    "smolvlm": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
    ),
714
    ### Tensor parallel / multi-gpu broadcast tests
715
    "chameleon-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
716
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
717
718
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
719
        auto_cls=AutoModelForImageTextToText,
720
721
722
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
723
        marks=multi_gpu_marks(num_gpus=2),
724
725
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
726
    "llava-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
727
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
728
729
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
730
        auto_cls=AutoModelForImageTextToText,
731
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
732
        marks=multi_gpu_marks(num_gpus=2),
733
734
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
735
    "llava_next-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
736
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
737
738
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
739
        auto_cls=AutoModelForImageTextToText,
740
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
741
        marks=multi_gpu_marks(num_gpus=2),
742
743
744
745
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
746
        models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")],
747
748
749
750
751
752
753
754
755
756
757
758
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
            ) for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
759
    "llava_onevision-multiple-images": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
760
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
761
762
763
764
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
765
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
766
767
768
769
770
771
772
773
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
774
775
776
777
778
779
780
781
782
783
784
785
786
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
            limit_mm_per_prompt={"image": 1},
        )],
    ),
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        tensor_parallel_size=8,
        vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
        marks=[large_gpu_mark(min_gb=80), multi_gpu_marks(num_gpus=8)],
    ),
803
804
805
806
}
# yapf: enable


807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    """Tag every test setting with a ``pytest.mark.split(group=i)`` mark.

    The model names referenced by all settings are sorted and cut into
    ``num_groups`` contiguous chunks of at most
    ``ceil(number_of_models / num_groups)`` models each. Every setting that
    lists a model from chunk ``i`` is copied (via ``_replace``) with
    ``pytest.mark.split(group=i)`` appended to its marks.

    Args:
        test_settings: Mapping from test name to its ``VLMTestInfo``.
        num_groups: Number of chunks the sorted model list is cut into.

    Returns:
        A new mapping with the same keys as ``test_settings`` whose values
        carry the additional split mark. The input mapping and the objects
        it holds are left untouched.

    Raises:
        AssertionError: If some key of ``test_settings`` did not end up in
            the result (for example a setting that lists no models).

    Note:
        A setting whose models land in different chunks is written once per
        model; later writes replace earlier ones, so it keeps only the mark
        of the last chunk that was processed.
    """
    # Reverse lookup by object identity: info object -> its settings key.
    setting_name_of = {
        id(info): name
        for name, info in test_settings.items()
    }

    # Group the settings by each model name they reference.
    infos_per_model: defaultdict[str, list[VLMTestInfo]] = defaultdict(list)
    for info in test_settings.values():
        for model_name in info.models:
            infos_per_model[model_name].append(info)

    ordered_models = sorted(infos_per_model)
    chunk_len = math.ceil(len(ordered_models) / num_groups)

    marked_settings: dict[str, VLMTestInfo] = {}
    for group_idx in range(num_groups):
        start = group_idx * chunk_len
        for model_name in ordered_models[start:start + chunk_len]:
            for info in infos_per_model[model_name]:
                existing_marks = info.marks or []
                split_mark = pytest.mark.split(group=group_idx)
                marked_settings[setting_name_of[id(info)]] = info._replace(
                    marks=existing_marks + [split_mark])

    # Sanity check: no setting may be dropped by the grouping above.
    missing_keys = test_settings.keys() - marked_settings.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"

    return marked_settings


# Rebind the settings to copies that each carry a `split(group=...)` mark, with
# the referenced models divided into 2 groups.
# NOTE(review): presumably the mark is used to shard this suite across CI jobs
# -- confirm against the CI configuration.
VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)


### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - audio
# - custom inputs
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
                             image_assets: ImageTestAssets, monkeypatch):
    """Run one single-image test case for ``model_type``.

    Parametrized over the ``VLMTestType.IMAGE`` cases selected with
    ``create_new_process_for_each_test=False``. The actual work is delegated
    to ``runners.run_single_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


875
876
877
878
879
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            image_assets: ImageTestAssets, monkeypatch):
    """Run one multi-image test case for ``model_type``.

    Parametrized over the ``VLMTestType.MULTI_IMAGE`` cases selected with
    ``create_new_process_for_each_test=False``. The actual work is delegated
    to ``runners.run_multi_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


900
901
902
903
904
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ))
def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: type[HfRunner],
                                vllm_runner: type[VllmRunner],
                                image_assets: ImageTestAssets, monkeypatch):
    """Run one image-embedding test case for ``model_type``.

    Parametrized over the ``VLMTestType.EMBEDDING`` cases selected with
    ``create_new_process_for_each_test=False``. The actual work is delegated
    to ``runners.run_embedding_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


924
925
926
927
928
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      video_assets: VideoTestAssets, monkeypatch):
    """Run one video test case for ``model_type``.

    Parametrized over the ``VLMTestType.VIDEO`` cases selected with
    ``create_new_process_for_each_test=False``. The actual work is delegated
    to ``runners.run_video_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ),
)
def test_audio_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
    monkeypatch,
):
    """Run one audio test case for ``model_type``.

    Parametrized over the ``VLMTestType.AUDIO`` cases selected with
    ``create_new_process_for_each_test=False``. The actual work is delegated
    to ``runners.run_audio_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    needs_v0 = model_type in REQUIRES_V0_MODELS
    if needs_v0:
        monkeypatch.setenv("VLLM_USE_V1", "0")

    info_for_model = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=info_for_model,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


968
969
970
971
972
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
    ))
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    """Run one custom-inputs test case for ``model_type``.

    Parametrized over the ``VLMTestType.CUSTOM_INPUTS`` cases selected with
    ``create_new_process_for_each_test=False``. The actual work is delegated
    to ``runners.run_custom_inputs_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Variants of the tests above for settings that request a new process
#### for each test (create_new_process_for_each_test=True)
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
                                   hf_runner: type[HfRunner],
                                   vllm_runner: type[VllmRunner],
                                   image_assets: ImageTestAssets, monkeypatch):
    """Run one single-image test case for ``model_type`` in a new process.

    Parametrized over the ``VLMTestType.IMAGE`` cases selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test`` decorator. The actual work is
    delegated to ``runners.run_single_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1020
1021
1022
1023
1024
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
                                  hf_runner: type[HfRunner],
                                  vllm_runner: type[VllmRunner],
                                  image_assets: ImageTestAssets, monkeypatch):
    """Run one multi-image test case for ``model_type`` in a new process.

    Parametrized over the ``VLMTestType.MULTI_IMAGE`` cases selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test`` decorator. The actual work is
    delegated to ``runners.run_multi_image_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1046
1047
1048
1049
1050
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
                                      hf_runner: type[HfRunner],
                                      vllm_runner: type[VllmRunner],
                                      image_assets: ImageTestAssets,
                                      monkeypatch):
    """Run one image-embedding test case for ``model_type`` in a new process.

    Parametrized over the ``VLMTestType.EMBEDDING`` cases selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test`` decorator. The actual work is
    delegated to ``runners.run_embedding_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1072
1073
1074
1075
1076
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ))
# Added for consistency with the other *_heavy tests in this file: the cases
# selected here ask for a new process per test, so the test must be wrapped.
@create_new_process_for_each_test()
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            video_assets: VideoTestAssets, monkeypatch):
    """Run one video test case for ``model_type`` in a new process.

    Parametrized over the ``VLMTestType.VIDEO`` cases selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test`` decorator. The actual work is
    delegated to ``runners.run_video_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ))
# Added for consistency with the other *_heavy tests in this file: the cases
# selected here ask for a new process per test, so the test must be wrapped.
@create_new_process_for_each_test()
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            audio_assets: AudioTestAssets, monkeypatch):
    """Run one audio test case for ``model_type`` in a new process.

    Parametrized over the ``VLMTestType.AUDIO`` cases selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test`` decorator. The actual work is
    delegated to ``runners.run_audio_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


1118
1119
1120
1121
1122
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    """Run one custom-inputs test case for ``model_type`` in a new process.

    Parametrized over the ``VLMTestType.CUSTOM_INPUTS`` cases selected with
    ``create_new_process_for_each_test=True`` and wrapped with the
    ``create_new_process_for_each_test`` decorator. The actual work is
    delegated to ``runners.run_custom_inputs_test``.
    """
    # Models listed in REQUIRES_V0_MODELS are run with VLLM_USE_V1=0;
    # monkeypatch restores the environment after the test.
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )