test_utils.py 14.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
import base64
import mimetypes
6
7
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
8
9
10

import numpy as np
import pytest
11
from PIL import Image, ImageChops
12

13
from vllm.multimodal.image import convert_image_mode
14
from vllm.multimodal.inputs import PlaceholderRange
15
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
16
17

# Image assets covering different file extensions (JPG/PNG) and pixel formats
# (gray/RGB/RGBA). Each entry is an asset file name; the comment above it
# records the URL noted for that asset.
TEST_IMAGE_ASSETS = [
    # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    # https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png
    "Grayscale_8bits_palette_sample_image.png",
    # https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png
    "1280px-Venn_diagram_rgb.svg.png",
    # https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png
    "RGBA_comp.png",
]

25
26
# Remote videos fetched by the video tests. The hosts of these URLs are the
# same ones the tests pass as ``allowed_media_domains``.
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
]

30

31
@pytest.fixture(scope="module")
def url_images(local_asset_server) -> dict[str, Image.Image]:
    """Map every name in ``TEST_IMAGE_ASSETS`` to its loaded image.

    Each image is obtained through ``local_asset_server.get_image_asset``.
    The fixture is module-scoped, so the mapping is built once per module.
    """
    return {
        image_url: local_asset_server.get_image_asset(image_url)
        for image_url in TEST_IMAGE_ASSETS
    }
38
39


40
def get_supported_suffixes() -> tuple[str, ...]:
    """Return the image file suffixes that the base64 test is run against.

    Returns:
        The OpenAI-listed suffixes followed by the extra ones, as one tuple.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_supported_suffixes = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    extra_supported_suffixes = ('.bmp', '.tiff')

    return openai_supported_suffixes + extra_supported_suffixes


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether ``b``, converted to ``a``'s mode, matches ``a`` exactly.

    ``np.array_equal`` is used instead of ``(x == y).all()`` so that images
    whose pixel arrays differ in shape compare as ``False`` rather than
    failing inside the elementwise comparison, and so that the result is a
    plain ``bool`` as annotated.
    """
    pixels_a = np.asarray(a)
    pixels_b = np.asarray(convert_image_mode(b, a.mode))
    return bool(np.array_equal(pixels_a, pixels_b))
52
53


54
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
    """Fetching the same URL synchronously and asynchronously must agree."""
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)
    assert _image_equals(image_sync, image_async)


64
@pytest.mark.asyncio
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
                                  raw_image_url: str, suffix: str):
    """Round-trip each asset through a base64 ``data:`` URL for every suffix.

    The asset is saved to a temporary file with the given suffix, embedded in
    a data URL and fetched back both synchronously and asynchronously. The
    case is skipped when no MIME type is known for the suffix or when the
    image cannot be written as JPEG because it is RGBA.
    """
    connector = MediaConnector(
        # Domain restriction should not apply to data URLs.
        allowed_media_domains=[
            "www.bogotobogo.com",
            "github.com",
        ])
    url_image = url_images[raw_image_url]

    # Look the MIME type up in PIL's registry first, then in ``mimetypes``.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Guard the index: an exception raised without arguments would
            # otherwise trigger IndexError here and hide the real failure.
            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = connector.fetch_image(data_url)
        # Pixels are only compared when the saved file still matches the
        # source image; for a lossy format it is enough that the image can
        # be opened.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)

        data_image_async = await connector.fetch_image_async(data_url)
        assert _image_equals(data_image_sync, data_image_async)
105
106


107
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
    """Check ``file://`` fetching and the local-path restrictions.

    A connector created with ``allowed_local_media_path`` must load a file
    inside that directory identically through the sync and async APIs, and
    must reject a path that escapes it. A connector created without that
    option must refuse local files.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        # Build the shared names once instead of repeating them per call.
        file_name = os.path.basename(image_url)
        inside_url = f"file://{temp_dir}/{file_name}"
        outside_url = f"file://{temp_dir}/../{file_name}"

        origin_image = connector.fetch_image(image_url)
        origin_image.save(os.path.join(temp_dir, file_name),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        image_async = await local_connector.fetch_image_async(inside_url)
        image_sync = local_connector.fetch_image(inside_url)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        # Async API: a path outside the allowed directory is rejected, and
        # the unrestricted connector cannot load local files.
        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(outside_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(outside_url)

        # Sync API: same expectations.
        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(outside_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(outside_url)
140
141


142
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    """A local file whose name contains spaces must be fetchable.

    The first test asset is saved under a name containing spaces and loaded
    back through a ``file://`` URL with both the async and sync APIs. A
    ``FileNotFoundError`` from either call is reported as a test failure.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        filename = "file name with space.jpg"
        origin_image.save(os.path.join(temp_dir, filename),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        file_url = f"file://{temp_dir}/{filename}"
        try:
            image_async = await local_connector.fetch_image_async(file_url)
            image_sync = local_connector.fetch_image(file_url)
        except FileNotFoundError as e:
            pytest.fail(f"Failed to fetch image with space in name: {e}")
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()


168
169
170
171
172
173
174
175
176
177
178
179
180
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    """Undecodable image data must raise ``ValueError`` from both fetch APIs."""
    # The payload is valid base64 but decodes to plain text, not a PNG.
    corrupt_data_url = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
    media_connector = MediaConnector()

    # PIL.UnidentifiedImageError should be converted to ValueError
    with pytest.raises(ValueError):
        await media_connector.fetch_image_async(corrupt_data_url)

    with pytest.raises(ValueError):
        media_connector.fetch_image(corrupt_data_url)


181
182
183
184
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Sync and async video fetches must return equal frames and metadata.

    ``num_frames`` is forwarded to the connector through
    ``media_io_kwargs["video"]``.
    """
    connector = MediaConnector(
        media_io_kwargs={"video": {
            "num_frames": num_frames,
        }})

    video_sync, metadata_sync = connector.fetch_video(video_url)
    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
194
195


196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """Fetch videos with ``VLLM_VIDEO_LOADER_BACKEND`` set to opencv_dynamic.

    Sync and async results must match, and the returned metadata must report
    ``opencv_dynamic`` as the video backend.
    """
    video_io_kwargs = {
        "max_duration": max_duration,
        "requested_fps": requested_fps,
    }

    with monkeypatch.context() as patched_env:
        # The environment override only lasts for this ``with`` block.
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        media_connector = MediaConnector(
            media_io_kwargs={"video": video_io_kwargs})

        frames_sync, meta_sync = media_connector.fetch_video(video_url)
        frames_async, meta_async = await media_connector.fetch_video_async(
            video_url)

        assert np.array_equal(frames_sync, frames_async)
        assert meta_sync == meta_async
        assert meta_sync["video_backend"] == "opencv_dynamic"


222
223
224
225
# yapf: disable
@pytest.mark.parametrize(
    "case",
    [
        # Single modality
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=3, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=3, length=2),
                    PlaceholderRange(offset=0, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("image", 0),
            ],
        ),

        # Two modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ]
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("audio", 1),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=4),
                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                    PlaceholderRange(offset=11, length=4),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("audio", 0),
                ("image", 1),
                ("audio", 1),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=8, length=2),
                    PlaceholderRange(offset=0, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=11, length=4),
                    PlaceholderRange(offset=5, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("audio", 1),
                ("image", 0),
                ("audio", 0),
            ],
        ),

        # Three modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=15, length=7),
                    PlaceholderRange(offset=22, length=8),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=3, length=4),
                    PlaceholderRange(offset=7, length=5),
                    PlaceholderRange(offset=12, length=6),
                ]
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("video", 0),
                ("video", 1),
                ("video", 2),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                    PlaceholderRange(offset=20, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
                ("audio", 0),
                ("video", 0),
                ("image", 2),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=20, length=4),
                    PlaceholderRange(offset=2, length=3),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 2),
                ("audio", 0),
                ("video", 0),
                ("image", 1),
            ],
        ),
    ],
)
# yapf: enable
def test_argsort_mm_positions(case):
    """Check the ``(modality, index)`` order from ``argsort_mm_positions``.

    Each case supplies per-modality placeholder ranges and the expected
    result. In every case the expected list orders the placeholders by
    ascending ``offset``, and each index refers to the item's position within
    its own modality's list.
    """
    mm_positions = case["mm_positions"]
    expected_modality_idxs = case["expected_modality_idxs"]

    modality_idxs = argsort_mm_positions(mm_positions)

    assert modality_idxs == expected_modality_idxs
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420


@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_allowed_media_domains(video_url: str, num_frames: int):
    """Check that ``allowed_media_domains`` is enforced for video fetches.

    URLs from ``TEST_VIDEO_URLS`` are on the allow-list and must load the
    same way through the sync and async APIs. A URL on a host that is not
    listed must raise ``ValueError`` from both.
    """
    permitted_domains = [
        "www.bogotobogo.com",
        "github.com",
    ]
    media_connector = MediaConnector(
        media_io_kwargs={"video": {
            "num_frames": num_frames,
        }},
        allowed_media_domains=permitted_domains)

    frames_sync, meta_sync = media_connector.fetch_video(video_url)
    frames_async, meta_async = await media_connector.fetch_video_async(
        video_url)
    assert np.array_equal(frames_sync, frames_async)
    assert meta_sync == meta_async

    # This host is not in the allow-list above.
    blocked_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
    with pytest.raises(ValueError):
        _, _ = media_connector.fetch_video(blocked_url)

    with pytest.raises(ValueError):
        _, _ = await media_connector.fetch_video_async(blocked_url)