# test_utils.py (14.6 KB)
# (web-viewer residue: "Newer Older")
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
import base64
import mimetypes
6
7
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
8
9
10

import numpy as np
import pytest
11
from PIL import Image, ImageChops
12

13
from vllm.multimodal.image import convert_image_mode
14
from vllm.multimodal.inputs import PlaceholderRange
15
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
16
17

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each entry is an asset file name; the comment above it records the
# upstream URL the asset was taken from.
TEST_IMAGE_ASSETS = [
    # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    # https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png
    "Grayscale_8bits_palette_sample_image.png",
    # https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png
    "1280px-Venn_diagram_rgb.svg.png",
    # https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png
    "RGBA_comp.png",
]

25
26
# Remote videos used by the video-fetching tests below. Their hosts
# (www.bogotobogo.com, github.com) are the ones listed in the
# ``allowed_media_domains`` arguments used elsewhere in this file.
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
]

30

31
@pytest.fixture(scope="module")
def url_images(local_asset_server) -> dict[str, Image.Image]:
    """Map every name in ``TEST_IMAGE_ASSETS`` to its loaded image.

    Each image is obtained from ``local_asset_server.get_image_asset``.
    The fixture is module-scoped, so the mapping is built once per module.
    """
    return {
        asset_name: local_asset_server.get_image_asset(asset_name)
        for asset_name in TEST_IMAGE_ASSETS
    }
37
38


39
def get_supported_suffixes() -> tuple[str, ...]:
    """Return the image file suffixes the base64 test is parametrized over.

    The result is the OpenAI-listed suffixes followed by the extra
    suffixes, in that order; every suffix includes the leading dot.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_supported_suffixes = (".png", ".jpeg", ".jpg", ".webp", ".gif")

    # Additional file types that are supported by us
    extra_supported_suffixes = (".bmp", ".tiff")

    return openai_supported_suffixes + extra_supported_suffixes


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return True when ``a`` and ``b`` contain identical pixel data.

    ``b`` is converted to ``a``'s mode with ``convert_image_mode`` before
    the comparison, so images that differ only in mode can compare equal.
    """
    pixels_a = np.asarray(a)
    pixels_b = np.asarray(convert_image_mode(b, a.mode))
    # np.array_equal returns a plain bool and reports a shape mismatch as
    # "not equal" instead of broadcasting or raising like ``==`` would.
    return np.array_equal(pixels_a, pixels_b)
51
52


53
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
    """Sync and async fetches of the same ``image_url`` yield equal images.

    ``image_url`` is parametrized indirectly, so each asset name is passed
    through an ``image_url`` fixture that is not defined in this file
    (presumably in a conftest -- verify).
    """
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)

    assert _image_equals(image_sync, image_async)


63
@pytest.mark.asyncio
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(
    url_images: dict[str, Image.Image], raw_image_url: str, suffix: str
):
    """Round-trip each asset through a base64 ``data:`` URL.

    The asset is saved to a temporary file with the given ``suffix``,
    encoded as a data URL, and fetched both synchronously and
    asynchronously. The case is skipped when no MIME type is known for the
    suffix or when the image mode cannot be written as JPEG.
    """
    connector = MediaConnector(
        # Domain restriction should not apply to data URLs.
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ]
    )
    url_image = url_images[raw_image_url]

    # Prefer Pillow's MIME table; fall back to the stdlib mimetypes table.
    pil_format = Image.registered_extensions().get(suffix)
    mime_type = Image.MIME.get(pil_format) or mimetypes.types_map.get(suffix)
    if mime_type is None:
        pytest.skip("No MIME type")

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except OSError as e:
            # Compare the message via str(e): unlike e.args[0] this cannot
            # raise IndexError for an exception carrying no arguments.
            if str(e) == "cannot write mode RGBA as JPEG":
                pytest.skip("Conversion not supported")
            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = connector.fetch_image(data_url)
        # For a lossy format the saved file differs from the source image;
        # in that case we only check that the data URL can be opened.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)

        data_image_async = await connector.fetch_image_async(data_url)
        assert _image_equals(data_image_sync, data_image_async)
106
107


108
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
    """Check ``file://`` fetching and the allowed-local-path restriction.

    The image is fetched once, saved into a temporary directory, and then
    loaded back through a connector whose ``allowed_local_media_path`` is
    that directory. A path that escapes the directory must raise
    ``ValueError``, and a connector created without an allowed path must
    raise ``RuntimeError``, for both the sync and async entry points.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        file_name = os.path.basename(image_url)
        local_url = f"file://{temp_dir}/{file_name}"
        # Points at the parent of the allowed directory.
        escaping_url = f"file://{temp_dir}/../{file_name}"

        origin_image = connector.fetch_image(image_url)
        origin_image.save(
            os.path.join(temp_dir, file_name),
            quality=100,
            icc_profile=origin_image.info.get("icc_profile"),
        )

        image_async = await local_connector.fetch_image_async(local_url)
        image_sync = local_connector.fetch_image(local_url)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(escaping_url)

        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(escaping_url)
147
148


149
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    """A local file whose name contains spaces can be fetched via ``file://``.

    The first test asset is saved as ``"file name with space.jpg"`` in a
    temporary directory and loaded back both asynchronously and
    synchronously. A ``FileNotFoundError`` from either fetch is reported
    as an explicit test failure.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        filename = "file name with space.jpg"
        origin_image.save(
            os.path.join(temp_dir, filename),
            quality=100,
            icc_profile=origin_image.info.get("icc_profile"),
        )

        local_url = f"file://{temp_dir}/{filename}"
        try:
            image_async = await local_connector.fetch_image_async(local_url)
            image_sync = local_connector.fetch_image(local_url)
        except FileNotFoundError as e:
            pytest.fail(f"Failed to fetch image with space in name: {e}")
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()


176
177
178
179
180
181
182
183
184
185
186
187
188
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    """Fetching an undecodable image must surface as ``ValueError``.

    The data URL below carries valid base64 whose decoded bytes are the
    text ``hello_vllm_community`` rather than PNG data.
    """
    malformed_data_url = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
    media_connector = MediaConnector()

    # PIL.UnidentifiedImageError should be converted to ValueError,
    # on the async path ...
    with pytest.raises(ValueError):
        await media_connector.fetch_image_async(malformed_data_url)

    # ... and on the sync path.
    with pytest.raises(ValueError):
        media_connector.fetch_image(malformed_data_url)


189
190
191
192
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Sync and async video fetches return equal frames and metadata.

    ``num_frames`` is forwarded to the connector through
    ``media_io_kwargs["video"]``.
    """
    connector = MediaConnector(
        media_io_kwargs={
            "video": {
                "num_frames": num_frames,
            }
        }
    )

    video_sync, metadata_sync = connector.fetch_video(video_url)
    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
205
206


207
208
209
210
211
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
    video_url: str,
    max_duration: int,
    requested_fps: int,
    monkeypatch: pytest.MonkeyPatch,
):
    """Fetch videos with ``VLLM_VIDEO_LOADER_BACKEND=opencv_dynamic``.

    Sync and async fetches must return equal frames and metadata, and the
    metadata must report ``"opencv_dynamic"`` as its ``video_backend``.
    """
    # The environment override is scoped to this context and undone on exit.
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        connector = MediaConnector(
            media_io_kwargs={
                "video": {
                    "max_duration": max_duration,
                    "requested_fps": requested_fps,
                }
            }
        )

        video_sync, metadata_sync = connector.fetch_video(video_url)
        video_async, metadata_async = await connector.fetch_video_async(video_url)

        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"


236
237
238
@pytest.mark.parametrize(
    "case",
    [
        # Single modality
        ## Internally sorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=0, length=2),
                        PlaceholderRange(offset=3, length=2),
                    ]
                },
                expected_modality_idxs=[
                    ("image", 0),
                    ("image", 1),
                ],
            ),
            id="single-sorted",
        ),
        ## Internally unsorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=3, length=2),
                        PlaceholderRange(offset=0, length=2),
                    ]
                },
                expected_modality_idxs=[
                    ("image", 1),
                    ("image", 0),
                ],
            ),
            id="single-unsorted",
        ),
        # Two modalities
        ## Internally sorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=7, length=4),
                        PlaceholderRange(offset=11, length=5),
                    ],
                    "audio": [
                        PlaceholderRange(offset=0, length=2),
                        PlaceholderRange(offset=2, length=3),
                    ],
                },
                expected_modality_idxs=[
                    ("audio", 0),
                    ("audio", 1),
                    ("image", 0),
                    ("image", 1),
                ],
            ),
            id="two-sorted",
        ),
        ## Interleaved, internally sorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=0, length=4),
                        PlaceholderRange(offset=8, length=2),
                    ],
                    "audio": [
                        PlaceholderRange(offset=5, length=2),
                        PlaceholderRange(offset=11, length=4),
                    ],
                },
                expected_modality_idxs=[
                    ("image", 0),
                    ("audio", 0),
                    ("image", 1),
                    ("audio", 1),
                ],
            ),
            id="two-interleaved-sorted",
        ),
        ## Interleaved, internally unsorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=8, length=2),
                        PlaceholderRange(offset=0, length=4),
                    ],
                    "audio": [
                        PlaceholderRange(offset=11, length=4),
                        PlaceholderRange(offset=5, length=2),
                    ],
                },
                expected_modality_idxs=[
                    ("image", 1),
                    ("audio", 1),
                    ("image", 0),
                    ("audio", 0),
                ],
            ),
            id="two-interleaved-unsorted",
        ),
        # Three modalities
        ## Internally sorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=15, length=7),
                        PlaceholderRange(offset=22, length=8),
                    ],
                    "audio": [
                        PlaceholderRange(offset=0, length=2),
                    ],
                    "video": [
                        PlaceholderRange(offset=3, length=4),
                        PlaceholderRange(offset=7, length=5),
                        PlaceholderRange(offset=12, length=6),
                    ],
                },
                expected_modality_idxs=[
                    ("audio", 0),
                    ("video", 0),
                    ("video", 1),
                    ("video", 2),
                    ("image", 0),
                    ("image", 1),
                ],
            ),
            id="three-sorted",
        ),
        ## Interleaved, internally sorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=0, length=2),
                        PlaceholderRange(offset=2, length=3),
                        PlaceholderRange(offset=20, length=4),
                    ],
                    "audio": [
                        PlaceholderRange(offset=5, length=2),
                    ],
                    "video": [
                        PlaceholderRange(offset=8, length=5),
                    ],
                },
                expected_modality_idxs=[
                    ("image", 0),
                    ("image", 1),
                    ("audio", 0),
                    ("video", 0),
                    ("image", 2),
                ],
            ),
            id="three-interleaved-sorted",
        ),
        ## Interleaved, internally unsorted
        pytest.param(
            dict(
                mm_positions={
                    "image": [
                        PlaceholderRange(offset=0, length=2),
                        PlaceholderRange(offset=20, length=4),
                        PlaceholderRange(offset=2, length=3),
                    ],
                    "audio": [
                        PlaceholderRange(offset=5, length=2),
                    ],
                    "video": [
                        PlaceholderRange(offset=8, length=5),
                    ],
                },
                expected_modality_idxs=[
                    ("image", 0),
                    ("image", 2),
                    ("audio", 0),
                    ("video", 0),
                    ("image", 1),
                ],
            ),
            id="three-interleaved-unsorted",
        ),
    ],
)
def test_argsort_mm_positions(case):
    """``argsort_mm_positions`` returns the expected ``(modality, index)`` order.

    Each ``case`` is a dict with ``mm_positions`` (modality name mapped to a
    list of ``PlaceholderRange``) and ``expected_modality_idxs``, the list of
    ``(modality, index)`` pairs the function must return. In every case the
    expected pairs are ordered by ascending placeholder ``offset``.
    """
    mm_positions = case["mm_positions"]
    expected_modality_idxs = case["expected_modality_idxs"]

    modality_idxs = argsort_mm_positions(mm_positions)

    assert modality_idxs == expected_modality_idxs
405
406
407
408
409
410
411


@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_allowed_media_domains(video_url: str, num_frames: int):
    """URLs on allowed domains load; a URL on another domain is rejected.

    The connector is restricted to ``www.bogotobogo.com`` and
    ``github.com``. Sync and async fetches of each test video must agree,
    while fetching from ``upload.wikimedia.org`` must raise ``ValueError``
    on both the sync and async paths.
    """
    connector = MediaConnector(
        media_io_kwargs={
            "video": {
                "num_frames": num_frames,
            }
        },
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ],
    )

    video_sync, metadata_sync = connector.fetch_video(video_url)
    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async

    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
    # The calls are expected to raise, so their results are not bound.
    with pytest.raises(ValueError):
        connector.fetch_video(disallowed_url)

    with pytest.raises(ValueError):
        await connector.fetch_video_async(disallowed_url)