# NOTE: residue from the repository web view, kept for reference only:
#   "tests/vscode:/vscode.git/clone" did not exist on "7560ae5cafbae3af9967ac7dc979cb31a40fc572"
#   test_utils.py 14.8 KB
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
6
import base64
import mimetypes
7
8
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
9
10
11

import numpy as np
import pytest
12
from PIL import Image, ImageChops
13

14
from vllm.multimodal.image import convert_image_mode
15
from vllm.multimodal.inputs import PlaceholderRange
16
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
17
18

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each entry is an asset file name; the original files are hosted under
# https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "Grayscale_8bits_palette_sample_image.png",
    "1280px-Venn_diagram_rgb.svg.png",
    "RGBA_comp.png",
]

26
27
# Remote videos fetched over HTTP by the video tests in this file.
# Their hosts must stay in sync with the `allowed_media_domains` lists
# used by the tests in this file.
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
]

31

32
@pytest.fixture(scope="module")
def url_images(local_asset_server) -> dict[str, Image.Image]:
    """Map every name in ``TEST_IMAGE_ASSETS`` to its loaded image.

    The images are obtained from the ``local_asset_server`` fixture via
    ``get_image_asset`` and, because the fixture is module-scoped, are built
    once per test module.
    """
    return {
        asset_name: local_asset_server.get_image_asset(asset_name)
        for asset_name in TEST_IMAGE_ASSETS
    }
38
39


40
def get_supported_suffixes() -> tuple[str, ...]:
    """Return the image file suffixes the base64 round-trip test covers.

    Returns:
        The OpenAI-documented suffixes followed by the extra ones supported
        here, each with its leading dot.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    OPENAI_SUPPORTED_SUFFIXES = (".png", ".jpeg", ".jpg", ".webp", ".gif")

    # Additional file types that are supported by us
    EXTRA_SUPPORTED_SUFFIXES = (".bmp", ".tiff")

    return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether ``a`` and ``b`` contain the same pixel data.

    ``b`` is first converted to the mode of ``a`` so that images stored in
    different modes can still be compared.
    """
    b_converted = convert_image_mode(b, a.mode)
    # np.array_equal returns False on a shape mismatch instead of raising or
    # broadcasting; bool() makes the result match the annotation.
    return bool(np.array_equal(np.asarray(a), np.asarray(b_converted)))
52
53


54
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
    """Sync and async image fetching must yield the same image.

    ``image_url`` is parametrized indirectly, so each asset name is passed
    through the ``image_url`` fixture (defined outside this view) first.
    """
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)

    assert _image_equals(image_sync, image_async)


64
@pytest.mark.asyncio
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(
    url_images: dict[str, Image.Image], raw_image_url: str, suffix: str
):
    """Round-trip an asset through a base64 ``data:`` URL saved as ``suffix``.

    The asset is written to a temporary file in the requested format,
    embedded in a data URL and fetched back through both the sync and async
    APIs. The test is skipped when no MIME type is known for ``suffix`` or
    when the image mode cannot be written in that format.
    """
    connector = MediaConnector(
        # Domain restriction should not apply to data URLs.
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ]
    )
    url_image = url_images[raw_image_url]

    # Prefer Pillow's own MIME table, then fall back to the stdlib one.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip("No MIME type")

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Compare via str(e): indexing e.args would raise IndexError for
            # an exception without arguments and hide the real failure.
            if str(e) == "cannot write mode RGBA as JPEG":
                pytest.skip("Conversion not supported")

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = connector.fetch_image(data_url)
        # Only a lossless save can be compared with the source image; for a
        # lossy format it is enough that the data URL could be opened.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)

        data_image_async = await connector.fetch_image_async(data_url)
        assert _image_equals(data_image_sync, data_image_async)
107
108


109
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
    """Check ``file://`` fetching and its path restrictions.

    The image is saved into a temporary directory that is configured as
    ``allowed_local_media_path``. Files inside it must load identically
    through the sync and async APIs. A path escaping the directory must raise
    ``ValueError``, and a connector without an allowed local path must raise
    ``RuntimeError``.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        file_name = os.path.basename(image_url)
        origin_image.save(
            os.path.join(temp_dir, file_name),
            quality=100,
            icc_profile=origin_image.info.get("icc_profile"),
        )

        inside_url = f"file://{temp_dir}/{file_name}"
        # Points at the parent of the allowed directory.
        outside_url = f"file://{temp_dir}/../{file_name}"

        image_async = await local_connector.fetch_image_async(inside_url)
        image_sync = local_connector.fetch_image(inside_url)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(outside_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(outside_url)

        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(outside_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(outside_url)
148
149


150
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    """A local file whose name contains spaces must load via ``file://``.

    The image is saved under a name with spaces inside the allowed local
    directory and fetched through both the sync and async APIs. A
    ``FileNotFoundError`` is reported as an explicit test failure.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        filename = "file name with space.jpg"
        origin_image.save(
            os.path.join(temp_dir, filename),
            quality=100,
            icc_profile=origin_image.info.get("icc_profile"),
        )

        file_url = f"file://{temp_dir}/{filename}"
        try:
            image_async = await local_connector.fetch_image_async(file_url)
            image_sync = local_connector.fetch_image(file_url)
        except FileNotFoundError as e:
            pytest.fail(f"Failed to fetch image with space in name: {e}")
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()


177
178
179
180
181
182
183
184
185
186
187
188
189
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    """Fetching a data URL that is not an image must raise ``ValueError``.

    Both the async and the sync fetch paths are checked.
    """
    # Well-formed base64, but the payload is plain text rather than a PNG.
    not_an_image = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
    media_connector = MediaConnector()

    # PIL.UnidentifiedImageError should be converted to ValueError
    with pytest.raises(ValueError):
        await media_connector.fetch_image_async(not_an_image)

    with pytest.raises(ValueError):
        media_connector.fetch_image(not_an_image)


190
@pytest.mark.flaky(reruns=3, reruns_delay=5)
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Sync and async video fetching must return equal frames and metadata.

    ``num_frames`` is forwarded to the video loader through
    ``media_io_kwargs``. A timeout while downloading skips the test rather
    than failing it, since it is treated as CI network flakiness.
    """
    connector = MediaConnector(
        media_io_kwargs={
            "video": {
                "num_frames": num_frames,
            }
        }
    )

    try:
        video_sync, metadata_sync = connector.fetch_video(video_url)
        video_async, metadata_async = await connector.fetch_video_async(video_url)
    except (TimeoutError, asyncio.TimeoutError) as e:
        pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")

    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
211
212


213
214
215
216
217
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
    video_url: str,
    max_duration: int,
    requested_fps: int,
    monkeypatch: pytest.MonkeyPatch,
):
    """Sync and async fetching must agree under the ``opencv_dynamic`` backend.

    The backend is selected through the ``VLLM_VIDEO_LOADER_BACKEND``
    environment variable, and the returned metadata must report it. As in
    ``test_fetch_video_http``, a download timeout skips the test.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        connector = MediaConnector(
            media_io_kwargs={
                "video": {
                    "max_duration": max_duration,
                    "requested_fps": requested_fps,
                }
            }
        )

        try:
            video_sync, metadata_sync = connector.fetch_video(video_url)
            video_async, metadata_async = await connector.fetch_video_async(
                video_url
            )
        except (TimeoutError, asyncio.TimeoutError) as e:
            pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")

        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"


242
243
244
@pytest.mark.parametrize(
    "case",
    [
        # Single modality
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=3, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=3, length=2),
                    PlaceholderRange(offset=0, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("image", 0),
            ],
        ),
        # Two modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ],
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("audio", 1),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=4),
                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                    PlaceholderRange(offset=11, length=4),
                ],
            },
            expected_modality_idxs=[
                ("image", 0),
                ("audio", 0),
                ("image", 1),
                ("audio", 1),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=8, length=2),
                    PlaceholderRange(offset=0, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=11, length=4),
                    PlaceholderRange(offset=5, length=2),
                ],
            },
            expected_modality_idxs=[
                ("image", 1),
                ("audio", 1),
                ("image", 0),
                ("audio", 0),
            ],
        ),
        # Three modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=15, length=7),
                    PlaceholderRange(offset=22, length=8),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=3, length=4),
                    PlaceholderRange(offset=7, length=5),
                    PlaceholderRange(offset=12, length=6),
                ],
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("video", 0),
                ("video", 1),
                ("video", 2),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                    PlaceholderRange(offset=20, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ],
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
                ("audio", 0),
                ("video", 0),
                ("image", 2),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=20, length=4),
                    PlaceholderRange(offset=2, length=3),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ],
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 2),
                ("audio", 0),
                ("video", 0),
                ("image", 1),
            ],
        ),
    ],
)
def test_argsort_mm_positions(case):
    """Check the ordering returned by ``argsort_mm_positions``.

    Each case maps modality names to placeholder ranges and lists the
    expected ``(modality, index)`` pairs. In every case the expected pairs
    are ordered by ascending placeholder offset, where ``index`` is the
    position of the range within its modality's input list.
    """
    mm_positions = case["mm_positions"]
    expected_modality_idxs = case["expected_modality_idxs"]

    modality_idxs = argsort_mm_positions(mm_positions)

    assert modality_idxs == expected_modality_idxs
411
412
413
414
415
416
417


@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_allowed_media_domains(video_url: str, num_frames: int):
    """Only URLs on an allowed domain may be fetched.

    Videos hosted on the allowed domains must load identically through the
    sync and async APIs, while a URL on any other domain must raise
    ``ValueError``. As in ``test_fetch_video_http``, a download timeout on
    the allowed URLs skips the test.
    """
    connector = MediaConnector(
        media_io_kwargs={
            "video": {
                "num_frames": num_frames,
            }
        },
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ],
    )

    try:
        video_sync, metadata_sync = connector.fetch_video(video_url)
        video_async, metadata_async = await connector.fetch_video_async(video_url)
    except (TimeoutError, asyncio.TimeoutError) as e:
        pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")

    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async

    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
    # The calls are expected to raise, so their results are not unpacked.
    with pytest.raises(ValueError):
        connector.fetch_video(disallowed_url)

    with pytest.raises(ValueError):
        await connector.fetch_video_async(disallowed_url)