test_utils.py 13.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
import base64
import mimetypes
6
7
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
8
from typing import TYPE_CHECKING, NamedTuple
9
10
11

import numpy as np
import pytest
12
from PIL import Image, ImageChops
13

14
from vllm.multimodal.image import convert_image_mode
15
from vllm.multimodal.inputs import PlaceholderRange
16
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
17

18
19
20
if TYPE_CHECKING:
    from vllm.multimodal.inputs import MultiModalPlaceholderDict

21
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
22
23
24
25
26
# Asset file names; the trailing comment on each entry records the upstream
# URL the asset was originally taken from.
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]

29
30
# Remote videos fetched over HTTP by the video tests.
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
]

34

35
@pytest.fixture(scope="module")
def url_images(local_asset_server) -> dict[str, Image.Image]:
    """Load every test image asset once per module.

    Returns:
        A mapping from each name in ``TEST_IMAGE_ASSETS`` to the image
        returned by ``local_asset_server.get_image_asset`` for that name.
    """
    return {
        image_url: local_asset_server.get_image_asset(image_url)
        for image_url in TEST_IMAGE_ASSETS
    }
42
43


44
def get_supported_suffixes() -> tuple[str, ...]:
    """Return the image file suffixes exercised by the base64 test.

    Returns:
        The OpenAI-listed suffixes followed by the extra suffixes, as a
        single tuple of dot-prefixed extensions.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff')

    return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether two images hold identical pixel data.

    ``b`` is first converted to ``a``'s mode so that images differing only
    in mode can still be compared.

    Args:
        a: Reference image; its mode is the comparison mode.
        b: Image to compare against ``a``.

    Returns:
        ``True`` if both pixel arrays have the same shape and values.
    """
    # np.array_equal (already used by the video tests in this module) yields
    # False on a shape mismatch instead of relying on elementwise `==`
    # followed by `.all()`, and it returns a plain Python bool.
    return np.array_equal(np.asarray(a),
                          np.asarray(convert_image_mode(b, a.mode)))
56
57


58
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
    """Check that sync and async image fetching return the same image."""
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)
    assert _image_equals(image_sync, image_async)


68
@pytest.mark.asyncio
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
                                  raw_image_url: str, suffix: str):
    """Check that images can be fetched from base64 ``data:`` URLs.

    The asset is re-encoded into the format implied by ``suffix``, wrapped
    in a data URL, and fetched both synchronously and asynchronously. The
    case is skipped when no MIME type is known for the suffix or when the
    image mode cannot be written in that format.
    """
    connector = MediaConnector()
    url_image = url_images[raw_image_url]

    # Prefer Pillow's own MIME table; fall back to the stdlib mapping.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Guard against exceptions with empty args so that the original
            # error is re-raised instead of being masked by an IndexError.
            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = connector.fetch_image(data_url)
        # Lossy formats do not round-trip exactly; for those we only check
        # that the image can be opened, so compare pixels only when the
        # saved file itself still matches the source image.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)

        data_image_async = await connector.fetch_image_async(data_url)
        assert _image_equals(data_image_sync, data_image_async)
104
105


106
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
    """Check ``file://`` fetching and its path restrictions.

    A connector created with ``allowed_local_media_path`` must load files
    inside that directory and reject paths that escape it, while a
    connector created without it must refuse local files altogether.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        file_name = os.path.basename(image_url)
        local_url = f"file://{temp_dir}/{file_name}"
        # Points one level above the allowed directory.
        escaping_url = f"file://{temp_dir}/../{file_name}"

        origin_image = connector.fetch_image(image_url)
        origin_image.save(os.path.join(temp_dir, file_name),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        image_async = await local_connector.fetch_image_async(local_url)
        image_sync = local_connector.fetch_image(local_url)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(escaping_url)

        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(escaping_url)
139
140


141
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    """Check that a local file whose name contains spaces can be fetched."""
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        filename = "file name with space.jpg"
        origin_image.save(os.path.join(temp_dir, filename),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        local_url = f"file://{temp_dir}/{filename}"
        try:
            image_async = await local_connector.fetch_image_async(local_url)
            image_sync = local_connector.fetch_image(local_url)
        except FileNotFoundError as e:
            # Report a readable test failure rather than a raw error.
            pytest.fail(f"Failed to fetch image with space in name: {e}")
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()


167
168
169
170
171
172
173
174
175
176
177
178
179
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    """Check that an undecodable image payload surfaces as ``ValueError``.

    Both the async and the sync fetch paths are exercised with the same
    data URL, whose base64 payload is plain text rather than PNG data.
    """
    media_connector = MediaConnector()
    invalid_png_url = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"

    # PIL.UnidentifiedImageError should be converted to ValueError
    with pytest.raises(ValueError):
        await media_connector.fetch_image_async(invalid_png_url)

    with pytest.raises(ValueError):
        media_connector.fetch_image(invalid_png_url)


180
181
182
183
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Check that sync and async video fetching agree.

    ``num_frames`` is forwarded to the connector through
    ``media_io_kwargs["video"]``; frames and metadata from both fetch paths
    must match.
    """
    connector = MediaConnector(
        media_io_kwargs={"video": {
            "num_frames": num_frames,
        }})

    video_sync, metadata_sync = connector.fetch_video(video_url)
    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
193
194


195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """Check sync/async video fetching with the ``opencv_dynamic`` backend.

    The backend is selected through the ``VLLM_VIDEO_LOADER_BACKEND``
    environment variable for the duration of the test, and the returned
    metadata must report that backend.
    """
    with monkeypatch.context() as env_patch:
        env_patch.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        video_io_kwargs = {
            "max_duration": max_duration,
            "requested_fps": requested_fps,
        }
        connector = MediaConnector(media_io_kwargs={"video": video_io_kwargs})

        sync_result = connector.fetch_video(video_url)
        async_result = await connector.fetch_video_async(video_url)
        video_sync, metadata_sync = sync_result
        video_async, metadata_async = async_result

        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"


221
# Used for `test_argsort_mm_positions`.
222
223
class TestCase(NamedTuple):
    """One input/expected-output pair for ``test_argsort_mm_positions``."""

    # Placeholder ranges keyed by modality name.
    mm_positions: "MultiModalPlaceholderDict"
    # Expected (modality, index-within-modality) pairs, in output order.
    expected_modality_idxs: list[tuple[str, int]]
225
226


227
def test_argsort_mm_positions():
    """Check ``argsort_mm_positions`` against hand-written expectations.

    Each case maps modalities to placeholder ranges and lists the expected
    ``(modality, index)`` pairs; in every case the expected order follows
    ascending placeholder offset across all modalities.
    """
    test_cases = [
        # Single modality
        ## Internally sorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=3, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Internally unsorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=3, length=2),
                    PlaceholderRange(offset=0, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("image", 0),
            ],
        ),

        # Two modalities
        ## Internally sorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ]
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("audio", 1),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=4),
                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                    PlaceholderRange(offset=11, length=4),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("audio", 0),
                ("image", 1),
                ("audio", 1),
            ],
        ),
        ## Interleaved, internally unsorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=8, length=2),
                    PlaceholderRange(offset=0, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=11, length=4),
                    PlaceholderRange(offset=5, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("audio", 1),
                ("image", 0),
                ("audio", 0),
            ],
        ),

        # Three modalities
        ## Internally sorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=15, length=7),
                    PlaceholderRange(offset=22, length=8),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=3, length=4),
                    PlaceholderRange(offset=7, length=5),
                    PlaceholderRange(offset=12, length=6),
                ]
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("video", 0),
                ("video", 1),
                ("video", 2),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                    PlaceholderRange(offset=20, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
                ("audio", 0),
                ("video", 0),
                ("image", 2),
            ],
        ),
        ## Interleaved, internally unsorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=20, length=4),
                    PlaceholderRange(offset=2, length=3),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 2),
                ("audio", 0),
                ("video", 0),
                ("image", 1),
            ],
        ),
    ]

    for mm_positions, expected_modality_idxs in test_cases:
        modality_idxs = argsort_mm_positions(mm_positions)

        assert modality_idxs == expected_modality_idxs