# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

4
5
import base64
import mimetypes
6
7
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
8
9
10

import numpy as np
import pytest
11
from PIL import Image, ImageChops
12

13
from vllm.multimodal.image import convert_image_mode
14
from vllm.multimodal.inputs import PlaceholderRange
15
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
16
17

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each entry is an asset file name; the comment above it records the
# upstream URL the asset was taken from.
TEST_IMAGE_ASSETS = [
    # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    # https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png
    "Grayscale_8bits_palette_sample_image.png",
    # https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png
    "1280px-Venn_diagram_rgb.svg.png",
    # https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png
    "RGBA_comp.png",
]

25
26
# Remote videos fetched over HTTP by the video tests (one MP4, one AVI).
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
]

30

31
@pytest.fixture(scope="module")
def url_images(local_asset_server) -> dict[str, Image.Image]:
    """Load every test image asset once per module.

    Args:
        local_asset_server: Fixture defined outside this file; only its
            ``get_image_asset(name)`` method is used here.

    Returns:
        A mapping from each name in ``TEST_IMAGE_ASSETS`` to the image
        returned by ``local_asset_server.get_image_asset`` for that name.
    """
    return {
        image_url: local_asset_server.get_image_asset(image_url)
        for image_url in TEST_IMAGE_ASSETS
    }
38
39


40
def get_supported_suffixes() -> tuple[str, ...]:
    """Return the image file suffixes exercised by the base64 fetch test.

    Returns:
        The OpenAI-documented suffixes followed by the extra suffixes that
        are additionally supported, each with a leading dot.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_supported_suffixes = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    extra_supported_suffixes = ('.bmp', '.tiff')

    return openai_supported_suffixes + extra_supported_suffixes


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether two images hold the same pixel data.

    ``b`` is first converted to the mode of ``a`` so that images differing
    only in mode can still compare equal.

    ``np.array_equal`` is used (as in the video tests of this file) so that
    a shape mismatch yields ``False`` rather than depending on elementwise
    broadcasting, and so that a plain ``bool`` is returned.
    """
    reference_pixels = np.asarray(a)
    candidate_pixels = np.asarray(convert_image_mode(b, a.mode))
    return np.array_equal(reference_pixels, candidate_pixels)
52
53


54
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_http(image_url: str):
    """Check that sync and async image fetching return the same image.

    ``image_url`` is parametrized indirectly, so each asset name is handed
    to an ``image_url`` fixture that is defined outside this file.
    """
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)

    assert _image_equals(image_sync, image_async)


64
@pytest.mark.asyncio
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
                                  raw_image_url: str, suffix: str):
    """Check that images can be fetched from base64 ``data:`` URLs.

    The asset is re-saved with the given ``suffix``, encoded as a data URL,
    and fetched both synchronously and asynchronously. The case is skipped
    when no MIME type is known for the suffix or when the RGBA-to-JPEG
    conversion is rejected while saving.
    """
    connector = MediaConnector()
    url_image = url_images[raw_image_url]

    # Prefer PIL's own MIME registry and fall back to the stdlib table.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Guard against exceptions raised without arguments so that the
            # original error is re-raised instead of an IndexError.
            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = connector.fetch_image(data_url)
        # For a lossy format the saved file no longer matches the original,
        # so it is only checked that the image can be opened; the pixel
        # comparison is made only when the round trip was lossless.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)

        data_image_async = await connector.fetch_image_async(data_url)
        assert _image_equals(data_image_sync, data_image_async)
100
101


102
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_fetch_image_local_files(image_url: str):
    """Check fetching images through ``file://`` URLs.

    The image is saved into a temporary directory that is registered as the
    allowed local media path. The test then asserts that:

    * sync and async fetches of the saved file return identical images;
    * a URL that leaves the allowed directory via ``..`` raises
      ``ValueError`` ("must be a subpath");
    * a connector created without ``allowed_local_media_path`` raises
      ``RuntimeError`` ("Cannot load local files").
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        filename = os.path.basename(image_url)
        allowed_url = f"file://{temp_dir}/{filename}"
        escaping_url = f"file://{temp_dir}/../{filename}"

        origin_image = connector.fetch_image(image_url)
        origin_image.save(os.path.join(temp_dir, filename),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        image_async = await local_connector.fetch_image_async(allowed_url)
        image_sync = local_connector.fetch_image(allowed_url)
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

        # Async API
        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(escaping_url)

        # Sync API
        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(escaping_url)
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(escaping_url)
135
136


137
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    """Check that a local file whose name contains spaces can be fetched.

    The image is saved as ``"file name with space.jpg"`` inside the allowed
    directory and fetched through a ``file://`` URL with both APIs. A
    ``FileNotFoundError`` is reported as an explicit test failure.
    """
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
        filename = "file name with space.jpg"
        origin_image.save(os.path.join(temp_dir, filename),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

        file_url = f"file://{temp_dir}/{filename}"
        try:
            image_async = await local_connector.fetch_image_async(file_url)
            image_sync = local_connector.fetch_image(file_url)
        except FileNotFoundError as e:
            pytest.fail(f"Failed to fetch image with space in name: {e}")
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()


163
164
165
166
167
168
169
170
171
172
173
174
175
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    """Check that an undecodable image surfaces as ``ValueError``.

    The data URL below claims to be a PNG, but its payload is not image
    data. PIL.UnidentifiedImageError should be converted to ValueError by
    both the async and the sync fetch API.
    """
    invalid_png_url = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
    media_connector = MediaConnector()

    # Async API
    with pytest.raises(ValueError):
        await media_connector.fetch_image_async(invalid_png_url)

    # Sync API
    with pytest.raises(ValueError):
        media_connector.fetch_image(invalid_png_url)


176
177
178
179
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Check that sync and async video fetching agree.

    ``num_frames`` is forwarded to the connector through
    ``media_io_kwargs["video"]``; both the frames and the metadata returned
    by the two APIs must match.
    """
    connector = MediaConnector(
        media_io_kwargs={"video": {
            "num_frames": num_frames,
        }})

    video_sync, metadata_sync = connector.fetch_video(video_url)
    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
189
190


191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """Check sync/async video fetching with the ``opencv_dynamic`` backend.

    The backend is selected through the ``VLLM_VIDEO_LOADER_BACKEND``
    environment variable for the duration of the test. Besides sync/async
    agreement, the returned metadata must report that backend.
    """
    video_io_kwargs = {
        "max_duration": max_duration,
        "requested_fps": requested_fps,
    }

    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        # Created only after the environment variable has been set.
        connector = MediaConnector(
            media_io_kwargs={"video": video_io_kwargs})

        sync_result = connector.fetch_video(video_url)
        async_result = await connector.fetch_video_async(video_url)
        video_sync, metadata_sync = sync_result
        video_async, metadata_async = async_result

        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"


217
218
219
220
# yapf: disable
@pytest.mark.parametrize(
    "case",
    [
        # Single modality
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=3, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=3, length=2),
                    PlaceholderRange(offset=0, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("image", 0),
            ],
        ),

        # Two modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ]
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("audio", 1),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=4),
                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                    PlaceholderRange(offset=11, length=4),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("audio", 0),
                ("image", 1),
                ("audio", 1),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=8, length=2),
                    PlaceholderRange(offset=0, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=11, length=4),
                    PlaceholderRange(offset=5, length=2),
                ]
            },
            expected_modality_idxs=[
                ("image", 1),
                ("audio", 1),
                ("image", 0),
                ("audio", 0),
            ],
        ),

        # Three modalities
        ## Internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=15, length=7),
                    PlaceholderRange(offset=22, length=8),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=3, length=4),
                    PlaceholderRange(offset=7, length=5),
                    PlaceholderRange(offset=12, length=6),
                ]
            },
            expected_modality_idxs=[
                ("audio", 0),
                ("video", 0),
                ("video", 1),
                ("video", 2),
                ("image", 0),
                ("image", 1),
            ],
        ),
        ## Interleaved, internally sorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                    PlaceholderRange(offset=20, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 1),
                ("audio", 0),
                ("video", 0),
                ("image", 2),
            ],
        ),
        ## Interleaved, internally unsorted
        dict(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=20, length=4),
                    PlaceholderRange(offset=2, length=3),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            expected_modality_idxs=[
                ("image", 0),
                ("image", 2),
                ("audio", 0),
                ("video", 0),
                ("image", 1),
            ],
        ),
    ],
)
# yapf: enable
def test_argsort_mm_positions(case):
    """Check the ordering returned by ``argsort_mm_positions``.

    Each case supplies ``mm_positions`` (modality name -> list of
    ``PlaceholderRange``) and ``expected_modality_idxs``, the expected list
    of ``(modality, index)`` pairs. In every case the expected pairs are
    listed in increasing order of placeholder offset, where ``index`` is the
    item's position within its modality's input list.
    """
    mm_positions = case["mm_positions"]
    expected_modality_idxs = case["expected_modality_idxs"]

    modality_idxs = argsort_mm_positions(mm_positions)

    assert modality_idxs == expected_modality_idxs