"vllm/vscode:/vscode.git/clone" did not exist on "5e5630a478fe75bc99e4ceea304f9ea68de5aaa6"
test_utils.py 17.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
6
import base64
import mimetypes
7
8
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
9
10
11

import numpy as np
import pytest
12
import torch
13
from PIL import Image, ImageChops
14

15
from vllm.multimodal.image import convert_image_mode
16
from vllm.multimodal.inputs import PlaceholderRange
17
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
18
19

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
20
# File names of the image assets used by the tests below. The public copy of
# each one is at
# https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/<name>
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "Grayscale_8bits_palette_sample_image.png",
    "1280px-Venn_diagram_rgb.svg.png",
    "RGBA_comp.png",
]

27
28
# Remote sample videos (one .mp4, one .avi) used by the video fetch tests.
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
]

32

33
@pytest.fixture(scope="module")
def url_images(local_asset_server) -> dict[str, Image.Image]:
    """Load every test image asset once per module.

    Args:
        local_asset_server: Fixture defined outside this file; only its
            ``get_image_asset(name)`` method is used here.

    Returns:
        A mapping from each name in ``TEST_IMAGE_ASSETS`` to the value
        returned by ``local_asset_server.get_image_asset`` for that name.
    """
    return {
        asset_name: local_asset_server.get_image_asset(asset_name)
        for asset_name in TEST_IMAGE_ASSETS
    }
39
40


41
def get_supported_suffixes() -> tuple[str, ...]:
    """Return the image file suffixes that the base64 fetch test covers.

    Returns:
        The OpenAI-listed suffixes followed by the extra suffixes, each with
        its leading dot.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_supported_suffixes = (".png", ".jpeg", ".jpg", ".webp", ".gif")

    # Additional file types that are supported by us
    extra_supported_suffixes = (".bmp", ".tiff")

    return openai_supported_suffixes + extra_supported_suffixes


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether two images hold the same pixel data.

    ``b`` is converted to the mode of ``a`` before comparing, so images that
    are stored in different modes can still be compared.

    Args:
        a: Reference image; its mode is the one used for the comparison.
        b: Image to compare against ``a``.

    Returns:
        ``True`` when both pixel arrays have the same shape and values.
    """
    pixels_a = np.asarray(a)
    pixels_b = np.asarray(convert_image_mode(b, a.mode))
    # array_equal yields a plain bool and reports a shape mismatch as
    # "not equal", where ``==`` followed by ``.all()`` would broadcast or fail.
    return bool(np.array_equal(pixels_a, pixels_b))
53
54


55
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
    """Check that sync and async image fetching agree for the same URL.

    ``indirect=True`` hands each asset name to an ``image_url`` fixture that
    is defined outside this file (presumably it turns the name into a URL).
    """
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)

    assert _image_equals(image_sync, image_async)


65
@pytest.mark.asyncio
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(
    url_images: dict[str, Image.Image], raw_image_url: str, suffix: str
):
    """Fetch an asset through a base64 ``data:`` URL, sync and async.

    The asset is re-saved with ``suffix``, encoded as a data URL and fetched
    through both connector entry points. The test is skipped when no MIME
    type is known for ``suffix`` or when an RGBA image cannot be written as
    JPEG.
    """
    connector = MediaConnector(
        # Domain restriction should not apply to data URLs.
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ]
    )
    url_image = url_images[raw_image_url]

    # Look the suffix up in Pillow's MIME table first, then in the stdlib one.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip("No MIME type")

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Guard the index so an exception raised without arguments is
            # re-raised as itself instead of being masked by an IndexError.
            if e.args and e.args[0] == "cannot write mode RGBA as JPEG":
                pytest.skip("Conversion not supported")

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = connector.fetch_image(data_url)
        # Pixel equality is only required when re-opening the saved file
        # reproduces the source image; for a lossy format it is enough that
        # the image could be opened.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)

        data_image_async = await connector.fetch_image_async(data_url)
        assert _image_equals(data_image_sync, data_image_async)
108
109


110
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
    """Check ``file://`` fetching and the local-path restrictions.

    A connector created with ``allowed_local_media_path`` must load a file
    from that directory identically through the sync and async paths, and
    must reject a URL that escapes the directory via ``..``. A connector
    created without that argument must refuse local files.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        # Build the file name and both URLs once; every check below reuses them.
        file_name = os.path.basename(image_url)
        allowed_url = f"file://{temp_dir}/{file_name}"
        escaping_url = f"file://{temp_dir}/../{file_name}"

        origin_image = connector.fetch_image(image_url)
        origin_image.save(
            os.path.join(temp_dir, file_name),
            quality=100,
            icc_profile=origin_image.info.get("icc_profile"),
        )

        image_async = await local_connector.fetch_image_async(allowed_url)
        image_sync = local_connector.fetch_image(allowed_url)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(escaping_url)

        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(escaping_url)
149
150


151
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    """Check that a local file whose name contains spaces can be fetched.

    The first test asset is saved under a name with spaces and then loaded
    through a ``file://`` URL by both the async and the sync fetcher. A
    ``FileNotFoundError`` from either fetch is reported as a test failure.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        filename = "file name with space.jpg"
        origin_image.save(
            os.path.join(temp_dir, filename),
            quality=100,
            icc_profile=origin_image.info.get("icc_profile"),
        )

        # The URL is left unescaped on purpose: the spaces are the test input.
        file_url = f"file://{temp_dir}/{filename}"
        try:
            image_async = await local_connector.fetch_image_async(file_url)
            image_sync = local_connector.fetch_image(file_url)
        except FileNotFoundError as e:
            pytest.fail(f"Failed to fetch image with space in name: {e}")
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()


178
179
180
181
182
183
184
185
186
187
188
189
190
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    """Both fetchers raise ``ValueError`` for a data URL that is not an image."""
    media_connector = MediaConnector()
    # Valid base64, but the decoded bytes are not PNG data.
    invalid_data_url = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"

    # PIL.UnidentifiedImageError should be converted to ValueError
    with pytest.raises(ValueError):
        await media_connector.fetch_image_async(invalid_data_url)

    with pytest.raises(ValueError):
        media_connector.fetch_image(invalid_data_url)


191
@pytest.mark.flaky(reruns=3, reruns_delay=5)
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Check that sync and async video fetching return identical results.

    ``num_frames`` is forwarded to the connector through
    ``media_io_kwargs["video"]``. A timeout while fetching skips the test
    instead of failing it, and the ``flaky`` marker reruns other failures.
    """
    connector = MediaConnector(
        media_io_kwargs={
            "video": {
                "num_frames": num_frames,
            }
        }
    )

    try:
        video_sync, metadata_sync = connector.fetch_video(video_url)
        video_async, metadata_async = await connector.fetch_video_async(video_url)
    except (TimeoutError, asyncio.TimeoutError) as e:
        pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")

    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
212
213


214
215
216
217
218
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
    video_url: str,
    max_duration: int,
    requested_fps: int,
    monkeypatch: pytest.MonkeyPatch,
):
    """Check sync/async video parity with the ``opencv_dynamic`` backend.

    The backend is selected through the ``VLLM_VIDEO_LOADER_BACKEND``
    environment variable, set only for the duration of the test. The
    returned metadata must name that backend.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        connector = MediaConnector(
            media_io_kwargs={
                "video": {
                    "max_duration": max_duration,
                    "requested_fps": requested_fps,
                }
            }
        )

        video_sync, metadata_sync = connector.fetch_video(video_url)
        video_async, metadata_async = await connector.fetch_video_async(video_url)

        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"


243
244
245
@pytest.mark.parametrize(
    "case",
    [
        # Single modality
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=3, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=3, length=2),
                    PlaceholderRange(offset=0, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("image", 0),
            ],
        ),
        # Two modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ],
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("audio", 1),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=4),
                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                    PlaceholderRange(offset=11, length=4),
                ],
            },
            expected_modality_idxs=[
                ("image", 0),
                ("audio", 0),
                ("image", 1),
                ("audio", 1),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=8, length=2),
                    PlaceholderRange(offset=0, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=11, length=4),
                    PlaceholderRange(offset=5, length=2),
                ],
            },
            expected_modality_idxs=[
                ("image", 1),
                ("audio", 1),
                ("image", 0),
                ("audio", 0),
            ],
        ),
        # Three modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=15, length=7),
                    PlaceholderRange(offset=22, length=8),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=3, length=4),
                    PlaceholderRange(offset=7, length=5),
                    PlaceholderRange(offset=12, length=6),
                ],
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("video", 0),
                ("video", 1),
                ("video", 2),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                    PlaceholderRange(offset=20, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ],
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
                ("audio", 0),
                ("video", 0),
                ("image", 2),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=20, length=4),
                    PlaceholderRange(offset=2, length=3),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ],
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 2),
                ("audio", 0),
                ("video", 0),
                ("image", 1),
            ],
        ),
    ],
)
def test_argsort_mm_positions(case):
    """Check the ``(modality, index)`` order from ``argsort_mm_positions``.

    Each case maps modality names to placeholder ranges and lists the pairs
    expected back. In every case the expected pairs are in ascending
    placeholder offset, and each index is the item's position in its own
    modality's input list.
    """
    mm_positions = case["mm_positions"]
    expected_modality_idxs = case["expected_modality_idxs"]

    modality_idxs = argsort_mm_positions(mm_positions)

    assert modality_idxs == expected_modality_idxs
412
413


414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
@pytest.mark.parametrize(
    "is_embed,expected",
    [
        (None, 5),
        (torch.tensor([True, True, True, True, True]), 5),
        (torch.tensor([False, False, False, False, False]), 0),
        (torch.tensor([True, False, True, False, True]), 3),
        (torch.tensor([True]), 1),
    ],
)
def test_placeholder_range_get_num_embeds(is_embed, expected):
    """Check ``PlaceholderRange.get_num_embeds`` against each mask.

    The cases expect the number of ``True`` entries in ``is_embed``, or the
    full range length when no mask is given.
    """
    # A range without a mask is given a fixed length of 5.
    num_tokens = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(
        offset=0,
        length=num_tokens,
        is_embed=is_embed,
    )
    assert placeholder.get_num_embeds == expected


@pytest.mark.parametrize(
    "is_embed,expected",
    [
        (None, None),
        (
            torch.tensor([False, True, False, True, True]),
            torch.tensor([0, 1, 1, 2, 3]),
        ),
        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
    ],
)
def test_placeholder_range_embeds_cumsum(is_embed, expected):
    """Check ``PlaceholderRange.embeds_cumsum`` for masked and unmasked ranges.

    Without a mask the attribute must be ``None``; with a mask it must equal
    the expected running count, and repeated access must yield one object.
    """
    # A range without a mask is given a fixed length of 5.
    num_tokens = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(
        offset=0,
        length=num_tokens,
        is_embed=is_embed,
    )

    if expected is not None:
        assert torch.equal(placeholder.embeds_cumsum, expected)
        # cached_property should return the same object on repeated access
        assert placeholder.embeds_cumsum is placeholder.embeds_cumsum
    else:
        assert placeholder.embeds_cumsum is None


@pytest.mark.parametrize(
    "is_embed,start_idx,end_idx,expected",
    [
        (None, 2, 4, (2, 4)),
        (
            torch.tensor([False, True, False, True, True]),
            3,
            5,
            (1, 3),
        ),
        (
            torch.tensor([False, True, False, True, True]),
            0,
            2,
            (0, 1),
        ),
        (
            torch.tensor([True, False, True, False]),
            2,
            2,
            (1, 1),
        ),
    ],
)
def test_placeholder_range_get_embeds_indices_in_range(
    is_embed, start_idx, end_idx, expected
):
    """Check ``get_embeds_indices_in_range`` for each mask and index pair.

    Each case supplies a start and end index and the pair of embed indices
    the method is expected to return for them.
    """
    # A range without a mask is given a fixed length of 5.
    num_tokens = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(
        offset=0,
        length=num_tokens,
        is_embed=is_embed,
    )
    embed_bounds = placeholder.get_embeds_indices_in_range(start_idx, end_idx)
    assert embed_bounds == expected


@pytest.mark.parametrize(
    "offset,is_embed,expected",
    [
        (0, None, [(0, 4)]),
        (
            2,
            torch.tensor([False, True, False, True, True]),
            [(3, 3), (5, 6)],
        ),
        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
        (0, torch.tensor([False, False, False, False]), []),
    ],
)
def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
    """Check ``extract_embeds_range`` for each offset and mask.

    The expected values are lists of ``(start, end)`` pairs; an all-``False``
    mask expects an empty list.
    """
    # A range without a mask is given a fixed length of 5.
    num_tokens = 5 if is_embed is None else len(is_embed)
    placeholder = PlaceholderRange(
        offset=offset,
        length=num_tokens,
        is_embed=is_embed,
    )
    embed_spans = placeholder.extract_embeds_range()
    assert embed_spans == expected


505
506
507
508
509
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_allowed_media_domains(video_url: str, num_frames: int):
    """Check that ``allowed_media_domains`` admits and rejects the right URLs.

    Videos hosted on the listed domains must load identically through the
    sync and async paths. A URL on any other domain must raise
    ``ValueError`` from both.
    """
    connector = MediaConnector(
        media_io_kwargs={
            "video": {
                "num_frames": num_frames,
            }
        },
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ],
    )

    video_sync, metadata_sync = connector.fetch_video(video_url)
    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async

    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
    # The results are deliberately not unpacked: tuple unpacking raises
    # ValueError on an arity mismatch, which would satisfy pytest.raises even
    # if the domain check never fired.
    with pytest.raises(ValueError):
        connector.fetch_video(disallowed_url)

    with pytest.raises(ValueError):
        await connector.fetch_video_async(disallowed_url)