"vllm/vscode:/vscode.git/clone" did not exist on "5a3f1eb62fb8a5d114001488832f8bd7f93df5b8"
test_common.py 55.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
6

7
8
import math
from collections import defaultdict
9
10
from pathlib import PosixPath

zhuwenwen's avatar
zhuwenwen committed
11
import os
12
import pytest
13
from packaging.version import Version
14
15
from transformers import (
    AutoModel,
16
    AutoModelForCausalLM,
17
18
19
    AutoModelForImageTextToText,
    AutoModelForTextToWaveform,
)
20
from transformers import __version__ as TRANSFORMERS_VERSION
21
22

from vllm.platforms import current_platform
23
from vllm.utils.func_utils import identity
24

25
26
27
28
29
30
31
32
33
from ....conftest import (
    IMAGE_ASSETS,
    AudioTestAssets,
    HfRunner,
    ImageTestAssets,
    VideoTestAssets,
    VllmRunner,
)
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
34
35
36
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
37
38
39
40
41
42
from .vlm_utils.types import (
    CustomTestOptions,
    ExpandableVLMTestArgs,
    VLMTestInfo,
    VLMTestType,
)
zhuwenwen's avatar
zhuwenwen committed
43
from ....utils import models_path_prefix
44
45
46
47
48
49

# Shared keyword settings for the "broadcast" (tensor-parallel) test cases.
# NOTE(review): presumably unpacked into individual VLMTestInfo entries
# further down the file -- the consumers are not visible from this section.
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    # Keep generations short; these cases only need a handful of tokens.
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(0.25, 0.5, 1.0)],
    # Both executor backends are listed so each one gets exercised.
    "distributed_executor_backend": (
        "ray",
        "mp",
    ),
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
#     use the k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#                 - pytest $TEST_FILE -k "[llava-"
#     prevents matching on "[llava_next-" & will match just the enabled cases
#     for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the k flag to substring match the model name, e.g.,
#                 - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#     prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#        pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#        pytest $TEST_FILE -k  "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.

VLM_TEST_SETTINGS = {
92
93
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
94
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
95
        test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
96
97
98
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
99
        auto_cls=AutoModelForImageTextToText,
100
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
101
102
103
104
105
106
107
108
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
109
        vllm_runner_kwargs={"enable_mm_embeds": True},
110
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
111
112
    ),
    "paligemma": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
113
        models=[os.path.join(models_path_prefix, "google/paligemma-3b-mix-224")],
114
115
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
116
        img_idx_to_prompt=lambda idx: "",
117
        # Paligemma uses its own sample prompts because the default one fails
118
119
120
121
122
123
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "caption es",
                "cherry_blossom": "What is in the picture?",
            }
        ),
124
        auto_cls=AutoModelForImageTextToText,
125
126
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
    ),
zhuwenwen's avatar
zhuwenwen committed
127

Roger Wang's avatar
Roger Wang committed
128
    "qwen2_5_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
129
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
130
131
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
132
133
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
134
        enforce_eager=False,
135
136
        max_model_len=4096,
        max_num_seqs=2,
137
        auto_cls=AutoModelForImageTextToText,
138
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
139
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
140
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
141
    ),
142
    "qwen2_5_omni": VLMTestInfo(
143
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B")],
144
145
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
146
147
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
148
149
        max_model_len=4096,
        max_num_seqs=2,
150
        num_logprobs=6 if current_platform.is_cpu() else 5,
151
        auto_cls=AutoModelForTextToWaveform,
152
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
153
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
154
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
155
156
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
157
    "qwen3_vl": VLMTestInfo(
158
        models=[os.path.join(models_path_prefix, "Qwen/Qwen3-VL-4B-Instruct")],
159
160
161
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
162
            VLMTestType.VIDEO,
163
        ),
164
        enforce_eager=False,
165
166
167
168
        needs_video_metadata=True,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
169
170
        max_model_len=4096,
        max_num_seqs=2,
171
172
        num_logprobs=20,
        auto_cls=AutoModelForImageTextToText,
173
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
174
        patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
175
176
177
178
179
180
181
        vllm_runner_kwargs={
            "attention_config": {
                "backend": "ROCM_AITER_FA",
            },
        }
        if current_platform.is_rocm()
        else None,
182
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
183
184
185
        marks=[
            pytest.mark.core_model,
        ],
186
    ),
187
    "ultravox": VLMTestInfo(
188
        models=[os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")],
189
        test_type=VLMTestType.AUDIO,
190
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
191
192
193
194
195
196
197
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
198
199
200
201
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
    # Dynamic image length and number of patches
    "llava-onevision-transformers": VLMTestInfo(
202
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
203
        test_type=VLMTestType.IMAGE,
204
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
205
        max_model_len=16384,
206
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
207
            os.path.join(models_path_prefix,"llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
208
        ),
209
210
211
212
213
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
214
            "default_torch_num_threads": 1,
215
        },
216
217
218
        # FIXME: Investigate why the test hangs
        # when processing the 3rd prompt in vLLM
        marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
219
    ),
220
221
    # Gemma3 has bidirectional mask on images
    "gemma3-transformers": VLMTestInfo(
222
        models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
223
224
225
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        max_model_len=4096,
226
227
228
229
230
231
232
233
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
234
    "idefics3-transformers": VLMTestInfo(
235
        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
236
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
237
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
238
239
240
241
242
243
244
245
246
247
248
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
249
250
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
251
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
252
        test_type=VLMTestType.IMAGE,
253
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
254
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
255
256
257
258
259
260
261
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.2, 0.15)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
262
263
264
265
266
267
268
269
270
271
272
            # TODO: [ROCm] Revert this once issue #30167 is resolved
            **(
                {
                    "mm_processor_kwargs": {
                        "min_pixels": 256 * 28 * 28,
                        "max_pixels": 1280 * 28 * 28,
                    },
                }
                if current_platform.is_rocm()
                else {}
            ),
273
        },
274
        marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
275
    ),
276
    #### Extended model tests
277
    "aria": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
278
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
279
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
280
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ",  # noqa: E501
281
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
Roger Wang's avatar
Roger Wang committed
282
283
        max_model_len=4096,
        max_num_seqs=2,
284
        auto_cls=AutoModelForImageTextToText,
285
286
287
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<vlm_image>Please describe the image shortly.",
288
                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
289
290
            }
        ),
291
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
292
293
294
295
296
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
Jennifer Zhao's avatar
Jennifer Zhao committed
297
    "aya_vision": VLMTestInfo(
298
        models=[os.path.join(models_path_prefix, "CohereLabs/aya-vision-8b")],
299
        test_type=(VLMTestType.IMAGE),
300
301
302
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
303
304
                "stop_sign": "<image>What's the content in the center of the image?",
                "cherry_blossom": "<image>What is the season?",
305
306
            }
        ),
307
        multi_image_prompt="<image><image>Describe the two images in detail.",
308
309
310
311
312
313
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
314
        models=["CohereLabs/aya-vision-8b"],
315
        test_type=(VLMTestType.MULTI_IMAGE),
316
317
318
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
319
320
                "stop_sign": "<image>What's the content in the center of the image?",
                "cherry_blossom": "<image>What is the season?",
321
322
            }
        ),
323
        multi_image_prompt="<image><image>Describe the two images in detail.",
324
        max_model_len=4096,
Jennifer Zhao's avatar
Jennifer Zhao committed
325
326
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
327
328
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
Roger Wang's avatar
Roger Wang committed
329
    ),
330
    "blip2": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
331
        models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")],
332
333
334
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
335
        auto_cls=AutoModelForImageTextToText,
336
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
337
338
        # FIXME: https://github.com/huggingface/transformers/pull/38510
        marks=[pytest.mark.skip("Model is broken")],
339
340
    ),
    "chameleon": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
341
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
342
343
344
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
345
        max_num_seqs=2,
346
        auto_cls=AutoModelForImageTextToText,
347
        # For chameleon, we only compare the sequences
348
349
        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
350
351
352
353
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
354
    "deepseek_vl_v2": VLMTestInfo(
355
        models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
356
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
357
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
358
359
        max_model_len=4096,
        max_num_seqs=2,
360
361
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
362
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
363
364
365
366
                "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
            }
        ),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
367
368
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
369
        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
370
        image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
371
    ),
372
    "fuyu": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
373
        models=[os.path.join(models_path_prefix, "adept/fuyu-8b")],
374
375
376
377
378
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
379
        auto_cls=AutoModelForImageTextToText,
380
381
382
383
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
384
        marks=[large_gpu_mark(min_gb=32)],
385
    ),
386
    "gemma3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
387
        models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
388
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
389
390
391
392
393
394
395
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<start_of_image>What is the season?",
            }
        ),
396
397
398
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
399
        auto_cls=AutoModelForImageTextToText,
400
401
402
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
    ),
403
404
405
406
407
408
409
410
    "granite_vision": VLMTestInfo(
        models=["ibm-granite/granite-vision-3.3-2b"],
        test_type=(VLMTestType.IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n",
        max_model_len=8192,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
    ),
411
    "glm4v": VLMTestInfo(
412
        models=[os.path.join(models_path_prefix, "zai-org/glm-4v-9b")],
413
        test_type=VLMTestType.IMAGE,
414
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
415
416
417
418
419
420
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
            }
        ),
421
422
423
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
424
425
426
427
428
429
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
430
        auto_cls=AutoModelForCausalLM,
431
        marks=[large_gpu_mark(min_gb=32)],
432
    ),
433
    "glm4_1v": VLMTestInfo(
434
        models=["zai-org/GLM-4.1V-9B-Thinking"],
435
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
436
        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
437
438
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
439
440
441
442
443
444
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
445
        marks=[large_gpu_mark(min_gb=32)],
446
447
    ),
    "glm4_1v-video": VLMTestInfo(
448
        models=["zai-org/GLM-4.1V-9B-Thinking"],
449
450
        # GLM4.1V requires video metadata to be included with the input
        test_type=VLMTestType.CUSTOM_INPUTS,
451
        prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n",  # noqa: E501
452
453
454
455
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
456
457
458
459
460
461
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.video_with_metadata_glm4_1v(),
                limit_mm_per_prompt={"video": 1},
            )
        ],
462
        marks=[large_gpu_mark(min_gb=32)],
463
    ),
464
465
466
467
468
469
470
471
472
473
474
475
476
477
    "glm_ocr": VLMTestInfo(
        models=["zai-org/GLM-OCR"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=32)],
    ),
478
    "h2ovl": VLMTestInfo(
479
        models=[
480
481
            os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
            os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-2b"),
482
483
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
484
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
485
486
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
487
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
488
489
490
                "cherry_blossom": "<image>\nWhat is the season?",
            }
        ),
491
492
493
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
494
        num_logprobs=10,
495
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
496
    ),
497
    "idefics3": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
498
        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
499
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
500
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
501
502
503
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
504
        auto_cls=AutoModelForImageTextToText,
505
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
506
    ),
507
508
    "intern_vl": VLMTestInfo(
        models=[
zhuwenwen's avatar
zhuwenwen committed
509
510
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
            os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"),
511
512
            # FIXME: Config cannot be loaded in transformers 4.52
            # "OpenGVLab/Mono-InternVL-2B",
513
514
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
515
516
517
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
518
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
519
520
521
                "cherry_blossom": "<image>\nWhat is the season?",
            }
        ),
522
523
524
525
526
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
527
528
529
530
531
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
532
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
533
534
535
536
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
537
        num_logprobs=10 if current_platform.is_rocm() else 5,
538
    ),
539
540
541
542
543
544
545
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
546
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
547
548
549
550
551
552
        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        auto_cls=AutoModelForImageTextToText,
    ),
oscardev256's avatar
oscardev256 committed
553
    "isaac": VLMTestInfo(
554
555
556
557
        models=[
            "PerceptronAI/Isaac-0.1",
            "PerceptronAI/Isaac-0.2-2B-Preview",
        ],
oscardev256's avatar
oscardev256 committed
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: (
            f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
        ),
        img_idx_to_prompt=lambda idx: "<image>",
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<vlm_image>Please describe the image shortly.",
                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
            }
        ),
        multi_image_prompt=(
            "Picture 1: <vlm_image>\n"
            "Picture 2: <vlm_image>\n"
            "Describe these two images with one paragraph respectively."
        ),
        enforce_eager=False,
        max_model_len=4096,
        max_num_seqs=2,
        hf_model_kwargs={"device_map": "auto"},
        patch_hf_runner=model_utils.isaac_patch_hf_runner,
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
581
582
583
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
584
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>",  # noqa: E501
585
586
587
588
589
590
591
592
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        dtype="bfloat16",
        tensor_parallel_size=1,
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
593
594
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
595
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
596
597
598
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
599
        image_size_factors=[(0.25, 0.5, 1.0)],
600
601
602
603
604
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
605
606
        tensor_parallel_size=4,
        marks=multi_gpu_marks(num_gpus=4),
607
    ),
608
    "llava_next": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
609
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
610
611
612
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
613
        auto_cls=AutoModelForImageTextToText,
614
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
615
616
617
618
619
620
621
622
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
623
    ),
624
    "llava_onevision": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
625
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
626
        test_type=VLMTestType.CUSTOM_INPUTS,
627
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
628
629
        num_video_frames=16,
        max_model_len=16384,
630
631
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
632
        ),
633
        auto_cls=AutoModelForImageTextToText,
634
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
635
636
637
638
639
640
641
642
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                    formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
                ),
                limit_mm_per_prompt={"video": 4},
            )
        ],
643
644
    ),
    "llava_next_video": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
645
        models=[os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf")],
646
647
648
649
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
650
        max_num_seqs=2,
651
        auto_cls=AutoModelForImageTextToText,
652
653
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
654
    "mantis": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
655
        models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")],
656
657
658
659
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
660
        auto_cls=AutoModelForImageTextToText,
661
662
663
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
664
    "minicpmv_25": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
665
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")],
666
        test_type=VLMTestType.IMAGE,
667
668
669
670
671
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
672
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
673
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
674
675
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
676
    ),
677
678
679
680
681
682
683
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
684
685
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
            ["<|im_end|>", "<|endoftext|>"]
686
        ),
687
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
688
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
689
        # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
690
        marks=[pytest.mark.skip("HF import fails")],
691
    ),
692
    "minicpmv_26": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
693
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
694
695
696
697
698
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
699
700
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
            ["<|im_end|>", "<|endoftext|>"]
701
        ),
702
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
703
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
704
    ),
705
706
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],
707
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>",  # noqa: E501
708
709
710
711
712
713
714
715
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
        patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
        auto_cls=AutoModelForImageTextToText,
716
717
718
719
720
721
722
723
724
725
726
        marks=[
            large_gpu_mark(min_gb=80),
            # TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
            pytest.mark.skipif(
                current_platform.is_rocm(),
                reason=(
                    "ROCm: Model too large for single GPU; "
                    "multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
                ),
            ),
        ],
727
    ),
728
729
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
730
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
731
        prompt_formatter=identity,
732
733
        max_model_len=4096,
        max_num_seqs=2,
734
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
735
    ),
736
737
738
    "ovis1_6-gemma2": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
739
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
740
        img_idx_to_prompt=lambda idx: "<image>\n",
741
742
743
744
745
746
747
748
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=32)],
    ),
749
750
751
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
752
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
753
        img_idx_to_prompt=lambda idx: "<image>\n",
754
755
756
757
758
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
759
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
760
    ),
761
762
    "ovis2_5": VLMTestInfo(
        models=["AIDC-AI/Ovis2.5-2B"],
763
764
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
765
        img_idx_to_prompt=lambda idx: "<image>\n",
766
767
768
769
770
771
        video_idx_to_prompt=lambda idx: "<video>\n",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        num_logprobs=10,
        patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
772
        hf_model_kwargs={"revision": "refs/pr/5"},
773
    ),
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
    "paddleocr_vl": VLMTestInfo(
        models=["PaddlePaddle/PaddleOCR-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        img_idx_to_prompt=lambda idx: (
            "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
        ),
        multi_image_prompt=(
            "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
            "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
            "Describe these two images separately."
        ),
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForCausalLM,
789
        image_size_factors=[(0.25,)],
790
791
792
793
794
795
        marks=[
            pytest.mark.skipif(
                Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
                reason="This model is broken in Transformers v4.57.3",
            )
        ],
796
    ),
797
798
799
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
800
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n",  # noqa: E501
801
802
803
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        max_model_len=4096,
        max_num_seqs=2,
804
        runner="generate",
805
806
        # use sdpa mode for hf runner since phi3v didn't work with flash_attn
        hf_model_kwargs={"_attn_implementation": "sdpa"},
807
808
809
810
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
        num_logprobs=10,
    ),
811
    "pixtral_hf": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
812
        models=[os.path.join(models_path_prefix, "nm-testing/pixtral-12b-FP8-dynamic")],
813
814
815
816
817
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
818
        auto_cls=AutoModelForImageTextToText,
819
820
821
822
823
824
825
        marks=[
            large_gpu_mark(min_gb=48),
            pytest.mark.skipif(
                current_platform.is_rocm(),
                reason="Model produces a vector of <UNK> output in HF on ROCm",
            ),
        ],
826
    ),
827
    "qwen_vl": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
828
        models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
829
830
831
832
833
834
835
836
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
    ),
837
838
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
839
840
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
841
842
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
843
        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",  # noqa: E501
844
845
        max_model_len=4096,
        max_num_seqs=2,
846
        auto_cls=AutoModelForImageTextToText,
847
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
848
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
849
850
        marks=[pytest.mark.cpu_model],
    ),
851
852
853
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
854
855
856
        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
857
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
858
859
860
                "cherry_blossom": "<image>\nWhat is the season?",
            }
        ),
861
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
862
863
864
865
866
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
867
868
869
    "smolvlm": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
870
        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
871
872
873
874
875
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
876
        num_logprobs=10,
877
    ),
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
    "tarsier": VLMTestInfo(
        models=["omni-research/Tarsier-7b"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
    ),
    "tarsier2": VLMTestInfo(
        models=["omni-research/Tarsier2-Recap-7b"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
894
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
895
896
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
897
898
899
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
900
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
901
902
        marks=[pytest.mark.skip("Model initialization hangs")],
    ),
903
    ### Tensor parallel / multi-gpu broadcast tests
904
    "chameleon-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
905
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
906
907
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
908
        auto_cls=AutoModelForImageTextToText,
909
910
        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
911
        comparator=check_outputs_equal,
912
        marks=multi_gpu_marks(num_gpus=2),
913
        **COMMON_BROADCAST_SETTINGS,  # type: ignore
914
    ),
915
    "llava-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
916
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
917
918
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
919
        auto_cls=AutoModelForImageTextToText,
920
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
921
        marks=multi_gpu_marks(num_gpus=2),
922
        **COMMON_BROADCAST_SETTINGS,  # type: ignore
923
    ),
924
    "llava_next-broadcast": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
925
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
926
927
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
928
        auto_cls=AutoModelForImageTextToText,
929
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
930
        marks=multi_gpu_marks(num_gpus=2),
931
        **COMMON_BROADCAST_SETTINGS,  # type: ignore
932
933
934
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
935
        models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")],
936
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
937
938
939
940
941
942
943
944
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
945
946
            )
            for inp in custom_inputs.different_patch_input_cases_internvl()
947
948
        ],
    ),
949
    "llava_onevision-multiple-images": VLMTestInfo(
zhuwenwen's avatar
zhuwenwen committed
950
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
951
952
953
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
954
        auto_cls=AutoModelForImageTextToText,
955
956
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
957
        ),
958
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
959
960
961
962
963
964
965
966
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
967
968
969
970
971
972
        marks=[
            pytest.mark.skipif(
                Version(TRANSFORMERS_VERSION) == Version("4.57.1"),
                reason="This model is broken in Transformers v4.57.1",
            )
        ],
973
    ),
974
975
976
977
978
979
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
980
        auto_cls=AutoModelForImageTextToText,
981
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
982
983
984
985
986
987
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
                limit_mm_per_prompt={"image": 1},
            )
        ],
988
    ),
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        tensor_parallel_size=8,
        vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
        marks=[large_gpu_mark(min_gb=80), multi_gpu_marks(num_gpus=8)],
    ),
1005
1006
1007
}


1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    """Attach a ``pytest.mark.split(group=i)`` mark to every test setting.

    The distinct model names referenced by ``test_settings`` are sorted and
    cut into ``num_groups`` contiguous chunks of
    ``ceil(len(models) / num_groups)`` names each. Every test info that lists
    a model in chunk ``i`` is copied with ``pytest.mark.split(group=i)``
    appended to its marks.

    Args:
        test_settings: Mapping of test name to its ``VLMTestInfo``.
        num_groups: Number of groups to distribute the models over.

    Returns:
        A new mapping with the same keys whose values carry the extra mark.

    Raises:
        AssertionError: If any key of ``test_settings`` did not end up in the
            result.
    """
    # Test infos are looked up by identity so the original key can be
    # recovered after grouping by model.
    key_for_info = {id(info): key for key, info in test_settings.items()}

    infos_per_model: defaultdict[str, list[VLMTestInfo]] = defaultdict(list)
    for info in test_settings.values():
        for model_name in info.models:
            infos_per_model[model_name].append(info)

    ordered_models = sorted(infos_per_model)
    group_len = math.ceil(len(ordered_models) / num_groups)

    marked: dict[str, VLMTestInfo] = {}
    for group_idx in range(num_groups):
        start = group_idx * group_len
        for model_name in ordered_models[start : start + group_len]:
            for info in infos_per_model[model_name]:
                # Always derived from the original info, so an info whose
                # models fall into several groups keeps only the mark of the
                # last group processed.
                existing_marks = info.marks or []
                marked[key_for_info[id(info)]] = info._replace(
                    marks=existing_marks + [pytest.mark.split(group=group_idx)]
                )

    unassigned = test_settings.keys() - marked.keys()
    assert not unassigned, f"Missing keys: {unassigned}"

    return marked


# Rebind the settings so every entry carries a `pytest.mark.split(group=...)`
# mark, with the models distributed over two groups.
# NOTE(review): presumably used to shard this suite across CI jobs - confirm.
VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)


1043
1044
1045
1046
1047
1048
### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - audio
# - custom inputs
1051
1052
1053
1054
1055
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ),
)
def test_single_image_models(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run one parametrized single-image case for ``model_type``.

    Looks up the model's entry in ``VLM_TEST_SETTINGS`` and delegates to
    ``runners.run_single_image_test``.
    """
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1078
1079
1080
1081
1082
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ),
)
def test_multi_image_models(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run one parametrized multi-image case for ``model_type``.

    Looks up the model's entry in ``VLM_TEST_SETTINGS`` and delegates to
    ``runners.run_multi_image_test``.
    """
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1105
1106
1107
1108
1109
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ),
)
def test_image_embedding_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run one parametrized image-embedding case for ``model_type``.

    Looks up the model's entry in ``VLM_TEST_SETTINGS`` and delegates to
    ``runners.run_embedding_test``.
    """
    runners.run_embedding_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1130
1131
1132
1133
1134
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ),
)
def test_video_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    """Run one parametrized video case for ``model_type``.

    Looks up the model's entry in ``VLM_TEST_SETTINGS`` and delegates to
    ``runners.run_video_test``.
    """
    runners.run_video_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


1155
1156
1157
1158
1159
1160
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ),
)
def test_audio_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Run one parametrized audio case for ``model_type``.

    Looks up the model's entry in ``VLM_TEST_SETTINGS`` and delegates to
    ``runners.run_audio_test``.
    """
    runners.run_audio_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


1180
1181
1182
1183
1184
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
    ),
)
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    """Run one parametrized custom-inputs case for ``model_type``.

    Looks up the model's entry in ``VLM_TEST_SETTINGS`` and delegates to
    ``runners.run_custom_inputs_test``; no asset fixture is passed here.
    """
    runners.run_custom_inputs_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Same wrappers, filtered to the cases configured to run each test in a new process
1204
1205
1206
1207
1208
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_single_image_models_heavy(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run one single-image case selected for new-process execution.

    Parametrized with ``create_new_process_for_each_test=True`` and wrapped
    in ``create_new_process_for_each_test()``; otherwise delegates to
    ``runners.run_single_image_test`` with the model's settings entry.
    """
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1232
1233
1234
1235
1236
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_multi_image_models_heavy(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run one multi-image case selected for new-process execution.

    Parametrized with ``create_new_process_for_each_test=True`` and wrapped
    in ``create_new_process_for_each_test()``; otherwise delegates to
    ``runners.run_multi_image_test`` with the model's settings entry.
    """
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1260
1261
1262
1263
1264
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run one image-embedding case selected for new-process execution.

    Parametrized with ``create_new_process_for_each_test=True`` and wrapped
    in ``create_new_process_for_each_test()``; otherwise delegates to
    ``runners.run_embedding_test`` with the model's settings entry.
    """
    runners.run_embedding_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


1286
1287
1288
1289
1290
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ),
)
# The cases selected above are the ones flagged for new-process execution, so
# apply the same wrapper the other `*_heavy` tests in this file use.
@create_new_process_for_each_test()
def test_video_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    """Run one video case selected for new-process execution.

    Parametrized with ``create_new_process_for_each_test=True`` and wrapped
    in ``create_new_process_for_each_test()``; otherwise delegates to
    ``runners.run_video_test`` with the model's settings entry.
    """
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


1311
1312
1313
1314
1315
1316
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ),
)
# The cases selected above are the ones flagged for new-process execution, so
# apply the same wrapper the other `*_heavy` tests in this file use.
@create_new_process_for_each_test()
def test_audio_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Run one audio case selected for new-process execution.

    Parametrized with ``create_new_process_for_each_test=True`` and wrapped
    in ``create_new_process_for_each_test()``; otherwise delegates to
    ``runners.run_audio_test`` with the model's settings entry.
    """
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


1336
1337
1338
1339
1340
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    """Run one custom-inputs case selected for new-process execution.

    Parametrized with ``create_new_process_for_each_test=True`` and wrapped
    in ``create_new_process_for_each_test()``; otherwise delegates to
    ``runners.run_custom_inputs_test`` with the model's settings entry.
    """
    runners.run_custom_inputs_test(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )