# test_utils.py 4.38 KB
# Newer Older
1
2
3
4
5
6
7
8
import base64
import mimetypes
from tempfile import NamedTemporaryFile
from typing import Dict, Tuple

import numpy as np
import pytest
from PIL import Image
9
from transformers import AutoConfig, AutoTokenizer
10

11
12
from vllm.multimodal.utils import (async_fetch_image, fetch_image,
                                   repeat_and_pad_placeholder_tokens)
13
14
15
16
17
18
19
20
21
22

# Remote images used as inputs by the image-fetching tests in this module.
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]


23
24
25
@pytest.fixture(scope="module")
def url_images() -> Dict[str, Image.Image]:
    """Fetch every URL in ``TEST_IMAGE_URLS`` and map it to its image.

    The fixture is module-scoped, so the images are fetched once and the
    resulting mapping is shared by all tests in this module.
    """
    images_by_url = {}
    for image_url in TEST_IMAGE_URLS:
        images_by_url[image_url] = fetch_image(image_url)
    return images_by_url
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41


def get_supported_suffixes() -> Tuple[str, ...]:
    """Return the image file suffixes the base64 test is parametrized over.

    The OpenAI-listed suffixes come first, followed by the extra ones.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_suffixes = ('.png', '.jpeg', '.jpg', '.webp', '.gif')
    # Additional file types that are supported by us
    extra_suffixes = ('.bmp', '.tiff')

    return (*openai_suffixes, *extra_suffixes)


def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether two images contain identical pixel data.

    ``b`` is first converted to the mode of ``a`` so that both images are
    compared in the same pixel representation.

    Args:
        a: Reference image; its mode is used for the comparison.
        b: Image to compare against ``a``.

    Returns:
        ``True`` if both pixel arrays have the same shape and all elements
        are equal, ``False`` otherwise.
    """
    # np.array_equal checks the shapes before comparing elements, so images
    # of different sizes yield False instead of raising (or silently
    # broadcasting), and the result is a plain ``bool`` as annotated.
    return np.array_equal(np.asarray(a), np.asarray(b.convert(a.mode)))


42
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_fetch_image_http(image_url: str):
    """Check that the sync and async fetchers return the same image."""
    fetched_sync = fetch_image(image_url)
    fetched_async = await async_fetch_image(image_url)

    assert _image_equals(fetched_sync, fetched_async)


50
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
                                  image_url: str, suffix: str):
    """Check fetching an image from a base64 ``data:`` URL.

    The already-fetched image for ``image_url`` is saved to a temporary file
    with the given ``suffix``, encoded into a data URL, and fetched through
    both the sync and async fetchers.

    Args:
        url_images: Mapping from each test URL to its fetched image.
        image_url: Key selecting the source image from ``url_images``.
        suffix: File suffix (e.g. ``'.png'``) to save the image under.
    """
    url_image = url_images[image_url]

    # Prefer the MIME type PIL registers for this suffix; fall back to the
    # standard-library table, and skip the case if neither knows it.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Check that args is non-empty first: an exception raised without
            # arguments would otherwise trigger an IndexError here and hide
            # the real failure.
            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image_sync = fetch_image(data_url)
        # Only require equality with the source image when the saved file
        # itself reproduces the source image exactly.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image_sync)
        else:
            pass  # Lossy format; only check that image can be opened

        data_image_async = await async_fetch_image(data_url)
        assert _image_equals(data_image_sync, data_image_async)
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115


@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_repeat_and_pad_placeholder_tokens(model):
    """Check ``repeat_and_pad_placeholder_tokens`` against fixed expectations.

    The image token id is read from the model config, and each prompt is
    encoded with the model's tokenizer (without special tokens) before
    being passed to the function under test.
    """
    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)

    # Each case: (prompt, repeat_count, expected prompt, expected token ids)
    cases = [
        ("<image>", 2, "<image><image>", [32000] * 2),
        ("<image><image>", 2, "<image><image><image>", [32000] * 3),
        ("<image><image>", [3, 2], "<image><image><image><image><image>",
         [32000] * 5),
        ("Image:<image>Image:<image>!", [3, 2],
         "Image:<image><image><image>Image:<image><image>!",
         [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918]),
        ("<image>", [3, 2], "<image><image><image>", [32000] * 3),
    ]

    for case in cases:
        prompt, repeat_count, expected_prompt, expected_token_ids = case
        prompt_token_ids = tokenizer.encode(prompt, add_special_tokens=False)

        actual_prompt, actual_token_ids = repeat_and_pad_placeholder_tokens(
            tokenizer=tokenizer,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            placeholder_token_id=image_token_id,
            repeat_count=repeat_count,
        )

        assert actual_prompt == expected_prompt
        assert actual_token_ids == expected_token_ids