test_utils.py 18.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
import base64
import mimetypes
6
7
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
8
from typing import TYPE_CHECKING, NamedTuple, Optional
9
10
11

import numpy as np
import pytest
12
13
import torch
import torch.multiprocessing as mp
14
from PIL import Image, ImageChops
15

16
17
18
19
from tests.utils import multi_gpu_test
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import (init_distributed_environment,
                                             initialize_model_parallel)
20
from vllm.multimodal.image import convert_image_mode
21
from vllm.multimodal.inputs import PlaceholderRange
22
from vllm.multimodal.utils import (MediaConnector,
23
24
25
26
                                   merge_and_sort_multimodal_metadata,
                                   run_dp_sharded_vision_model)
from vllm.platforms import current_platform
from vllm.utils import get_open_port, update_environment_variables
27

28
29
30
31
if TYPE_CHECKING:
    from vllm.multimodal.hasher import MultiModalHashDict
    from vllm.multimodal.inputs import MultiModalPlaceholderDict

32
33
34
35
36
37
38
39
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]

40
41
42
43
44
TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://filesamples.com/samples/video/avi/sample_640x360.avi",
]

45

46
@pytest.fixture(scope="module")
47
def url_images() -> dict[str, Image.Image]:
48
49
50
51
52
53
    connector = MediaConnector()

    return {
        image_url: connector.fetch_image(image_url)
        for image_url in TEST_IMAGE_URLS
    }
54
55


56
def get_supported_suffixes() -> tuple[str, ...]:
57
58
59
60
61
62
63
64
65
66
    # We should at least test the file types mentioned in GPT-4 with Vision
    OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff')

    return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
67
    return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all()
68
69


70
@pytest.mark.asyncio
71
72
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_fetch_image_http(image_url: str):
73
74
75
76
    connector = MediaConnector()

    image_sync = connector.fetch_image(image_url)
    image_async = await connector.fetch_image_async(image_url)
77
78
79
    assert _image_equals(image_sync, image_async)


80
@pytest.mark.asyncio
81
82
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
83
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
84
                                  image_url: str, suffix: str):
85
    connector = MediaConnector()
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
    url_image = url_images[image_url]

    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            if e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

108
        data_image_sync = connector.fetch_image(data_url)
109
        if _image_equals(url_image, Image.open(f)):
110
            assert _image_equals(url_image, data_image_sync)
111
112
        else:
            pass  # Lossy format; only check that image can be opened
113

114
        data_image_async = await connector.fetch_image_async(data_url)
115
        assert _image_equals(data_image_sync, data_image_async)
116
117


118
119
120
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_fetch_image_local_files(image_url: str):
121
122
    connector = MediaConnector()

123
    with TemporaryDirectory() as temp_dir:
124
125
126
        local_connector = MediaConnector(allowed_local_media_path=temp_dir)

        origin_image = connector.fetch_image(image_url)
127
128
129
130
        origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
                          quality=100,
                          icc_profile=origin_image.info.get('icc_profile'))

131
132
133
134
        image_async = await local_connector.fetch_image_async(
            f"file://{temp_dir}/{os.path.basename(image_url)}")
        image_sync = local_connector.fetch_image(
            f"file://{temp_dir}/{os.path.basename(image_url)}")
135
136
137
        # Check that the images are equal
        assert not ImageChops.difference(image_sync, image_async).getbbox()

138
139
140
141
142
        with pytest.raises(ValueError, match="must be a subpath"):
            await local_connector.fetch_image_async(
                f"file://{temp_dir}/../{os.path.basename(image_url)}")
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            await connector.fetch_image_async(
143
144
                f"file://{temp_dir}/../{os.path.basename(image_url)}")

145
146
147
148
149
150
        with pytest.raises(ValueError, match="must be a subpath"):
            local_connector.fetch_image(
                f"file://{temp_dir}/../{os.path.basename(image_url)}")
        with pytest.raises(RuntimeError, match="Cannot load local files"):
            connector.fetch_image(
                f"file://{temp_dir}/../{os.path.basename(image_url)}")
151
152


153
154
155
156
157
158
159
160
161
162
163
164
165
@pytest.mark.asyncio
async def test_fetch_image_error_conversion():
    connector = MediaConnector()
    broken_img = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"

    # PIL.UnidentifiedImageError should be converted to ValueError
    with pytest.raises(ValueError):
        await connector.fetch_image_async(broken_img)

    with pytest.raises(ValueError):
        connector.fetch_image(broken_img)


166
167
168
169
170
171
172
173
174
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    connector = MediaConnector()

    video_sync = connector.fetch_video(video_url, num_frames=num_frames)
    video_async = await connector.fetch_video_async(video_url,
                                                    num_frames=num_frames)
175
176
177
    # Check that the video frames are equal and metadata are same
    assert np.array_equal(video_sync[0], video_async[0])
    assert video_sync[1] == video_async[1]
178
179


180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class TestCase(NamedTuple):
    mm_positions: "MultiModalPlaceholderDict"
    mm_hashes: Optional["MultiModalHashDict"]
    expected_modalities: list[str]
    expected_ranges: list[PlaceholderRange]
    expected_hashes: Optional[list[str]]


def test_merge_and_sort_multimodal_metadata():

    test_cases = [
        # Single modality should return result as is but flattened
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=3, length=2),
                ]
            },
            mm_hashes={"image": ["hash1", "hash2"]},
201
            expected_modalities=["image", "image"],
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=3, length=2),
            ],
            expected_hashes=["hash1", "hash2"],
        ),

        # Single modality without hashes return None for mm hash.
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=2),
                ]
            },
            mm_hashes=None,
218
            expected_modalities=["image", "image"],
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=2),
            ],
            expected_hashes=None,
        ),

        # Multiple modalities with hashes should return sorted modalities
        # and flattened ranges and hashes.
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ]
            },
            mm_hashes={
                "image": ["image_hash1", "image_hash2"],
                "audio": ["audio_hash1", "audio_hash2"],
            },
243
            expected_modalities=["audio", "audio", "image", "image"],
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=3),
                PlaceholderRange(offset=7, length=4),
                PlaceholderRange(offset=11, length=5),
            ],
            expected_hashes=[
                "audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
            ],
        ),

        # Multiple modalities without hashes should return sorted modalities
        # and flattened ranges and None.
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=7, length=4),
                    PlaceholderRange(offset=11, length=5),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                ]
            },
            mm_hashes=None,
269
            expected_modalities=["audio", "audio", "image", "image"],
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=3),
                PlaceholderRange(offset=7, length=4),
                PlaceholderRange(offset=11, length=5),
            ],
            expected_hashes=None,
        ),

        # Three modalities
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=15, length=7),
                    PlaceholderRange(offset=22, length=8),
                ],
                "audio": [
                    PlaceholderRange(offset=0, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=3, length=4),
                    PlaceholderRange(offset=7, length=5),
                    PlaceholderRange(offset=12, length=6),
                ]
            },
            mm_hashes={
                "image": ["image_hash1", "image_hash2"],
                "audio": ["audio_hash1"],
                "video": ["video_hash1", "video_hash2", "video_hash3"]
            },
300
301
302
            expected_modalities=[
                "audio", "video", "video", "video", "image", "image"
            ],
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=3, length=4),
                PlaceholderRange(offset=7, length=5),
                PlaceholderRange(offset=12, length=6),
                PlaceholderRange(offset=15, length=7),
                PlaceholderRange(offset=22, length=8),
            ],
            expected_hashes=[
                "audio_hash1", "video_hash1", "video_hash2", "video_hash3",
                "image_hash1", "image_hash2"
            ],
        ),
    ]

    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
         expected_hashes) in test_cases:
        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
            mm_positions, mm_hashes)

        assert modalities == expected_modalities
        assert ranges == expected_ranges
        assert hashes == expected_hashes


def test_merge_and_sort_multimodal_metadata_with_interleaving():

    test_cases = [

        # <image> <audio> <image> <audio>
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=4),
                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                    PlaceholderRange(offset=11, length=4),
                ]
            },
            mm_hashes={
                "image": ["image_hash1", "image_hash2"],
                "audio": ["audio_hash1", "audio_hash2"],
            },
348
349
350
351
352
353
354
355
356
357
            expected_modalities=["image", "audio", "image", "audio"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=4),
                PlaceholderRange(offset=5, length=2),
                PlaceholderRange(offset=8, length=2),
                PlaceholderRange(offset=11, length=4),
            ],
            expected_hashes=[
                "image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
            ],
358
359
        ),

360
        # <image> <image> <audio> <video> <image>
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=2, length=3),
                    PlaceholderRange(offset=20, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=8, length=5),
                ]
            },
            mm_hashes=None,
376
377
378
379
380
381
382
383
            expected_modalities=["image", "image", "audio", "video", "image"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=3),
                PlaceholderRange(offset=5, length=2),
                PlaceholderRange(offset=8, length=5),
                PlaceholderRange(offset=20, length=4),
            ],
384
385
            expected_hashes=None,
        ),
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416

        # <image> <audio> <video> <image> with hashes
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
                    PlaceholderRange(offset=18, length=4),
                ],
                "audio": [
                    PlaceholderRange(offset=6, length=2),
                ],
                "video": [
                    PlaceholderRange(offset=10, length=5),
                ]
            },
            mm_hashes={
                "image": ["image_hash1", "image_hash2"],
                "audio": ["audio_hash1"],
                "video": ["video_hash1"],
            },
            expected_modalities=["image", "audio", "video", "image"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=6, length=2),
                PlaceholderRange(offset=10, length=5),
                PlaceholderRange(offset=18, length=4),
            ],
            expected_hashes=[
                "image_hash1", "audio_hash1", "video_hash1", "image_hash2"
            ],
        ),
417
418
    ]

419
420
421
422
    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
         expected_hashes) in test_cases:
        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
            mm_positions, mm_hashes)
423

424
425
426
        assert modalities == expected_modalities
        assert ranges == expected_ranges
        assert hashes == expected_hashes
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513


class SimpleLinearModel(torch.nn.Module):
    """A simple linear vision model for testing."""

    def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32):
        super().__init__()
        self.flatten = torch.nn.Flatten()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor):
        # Flatten the input and apply linear transformation
        x = self.flatten(x)
        return self.linear(x)


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "batch_size",
    [
        1,  # Single image
        4,  # Small batch
        5,  # Odd batch size (for testing padding)
    ],
)
def test_run_dp_sharded_vision_model(batch_size: int):
    world_size = 2
    # Launch processes
    mp.spawn(
        run_dp_sharded_vision_model_vs_direct,
        args=(
            world_size,
            batch_size,
            get_open_port(),
        ),
        nprocs=world_size,
    )


def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
                                          batch_size: int, master_port: int):
    """
    Test that run_dp_sharded_vision_model produces the same results as 
    calling the model directly.
    """

    # Set random seed for reproducibility
    current_platform.seed_everything(0)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)

    update_environment_variables({
        'RANK': str(local_rank),
        'LOCAL_RANK': str(local_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': str(master_port),
    })

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # Create a test input tensor
    image_input = torch.randn(batch_size, 3, 224, 224)

    # Create a simple linear model
    vision_model = SimpleLinearModel()

    # Run the model directly on the full input
    with torch.inference_mode():
        direct_output = vision_model(image_input)

    # Run the model through the sharded function
    with torch.inference_mode():
        sharded_output = run_dp_sharded_vision_model(image_input, vision_model)

    # Check that the world size is setup correctly
    assert get_tensor_model_parallel_world_size() == world_size

    # Check that the outputs have the same shape
    assert direct_output.shape == sharded_output.shape

    # Check that the outputs are close (they should be identical)
    assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5)