# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64


# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
# Per-prompt image limit handed to the server through --limit-mm-per-prompt;
# the multi-image test expects requests above this limit to be rejected.
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each entry is an asset file name; the trailing comment gives the matching
# upload.wikimedia.org URL.
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
    "Grayscale_8bits_palette_sample_image.png",  # https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png
    "1280px-Venn_diagram_rgb.svg.png",  # https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png
    "RGBA_comp.png",  # https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png
]


@pytest.fixture(scope="module")
def default_image_server_args():
    """Return the command-line arguments used to launch the test server.

    The list is, in order: ``--enforce-eager``, ``--max-model-len 6000``,
    ``--max-num-seqs 128`` and ``--limit-mm-per-prompt`` with a JSON object
    capping images at ``MAXIMUM_IMAGES``.
    """
    image_limit_json = json.dumps({"image": MAXIMUM_IMAGES})

    # Build the list one flag (and its value, if any) at a time so each
    # option reads as a unit.
    cli_args = ["--enforce-eager"]
    cli_args += ["--max-model-len", "6000"]
    cli_args += ["--max-num-seqs", "128"]
    cli_args += ["--limit-mm-per-prompt", image_limit_json]
    return cli_args


@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The fixture is module-scoped, so the same server object is reused by
    every test in this module that requests it. The server is created with
    ``VLLM_ENABLE_RESPONSES_API_STORE=1`` supplied through ``env_dict``.
    Leaving the ``with`` block after the ``yield`` runs the server's
    context-manager exit.
    """
    with RemoteOpenAIServer(
        MODEL_NAME,
        default_image_server_args,
        env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
    ) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(image_server):
    """Yield an async client obtained from ``image_server``.

    The client is used as an async context manager, so its exit hook runs
    once the requesting test has finished.
    """
    client_ctx = image_server.get_async_client()
    async with client_ctx as openai_client:
        yield openai_client


@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    """Map every name in ``TEST_IMAGE_ASSETS`` to its base64-encoded image.

    Each asset is fetched with ``local_asset_server.get_image_asset`` and
    passed to ``encode_image_base64``. The fixture is session-scoped, so the
    encoding is done once per test session.
    """
    # The keys are asset file names (the TEST_IMAGE_ASSETS entries), not URLs.
    return {
        asset: encode_image_base64(local_asset_server.get_image_asset(asset))
        for asset in TEST_IMAGE_ASSETS
    }


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Send one image plus a text question and expect non-empty output text.

    ``image_url`` is parametrized with ``indirect=True``: each entry of
    ``TEST_IMAGE_ASSETS`` is handed to an ``image_url`` fixture defined
    outside this file.
    NOTE(review): presumably that fixture turns the asset name into a URL the
    server can fetch - confirm against the conftest.
    """
    content_text = "What's in this image?"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_image",
                    "image_url": image_url,
                    "detail": "auto",
                },
                {"type": "input_text", "text": content_text},
            ],
        }
    ]

    # test image url
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Send one image as an inline base64 data URL and expect non-empty text.

    ``raw_image_url`` is an entry of ``TEST_IMAGE_ASSETS`` and is used as the
    lookup key into the ``base64_encoded_image`` mapping.
    """
    content_text = "What's in this image?"

    # NOTE(review): the MIME type is hard-coded to image/jpeg for every asset,
    # including the .png ones; presumably encode_image_base64 re-encodes to
    # JPEG - confirm against its implementation.
    encoded_image = base64_encoded_image[raw_image_url]
    data_url = f"data:image/jpeg;base64,{encoded_image}"

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_image",
                    "image_url": data_url,
                    "detail": "auto",
                },
                {"type": "input_text", "text": content_text},
            ],
        }
    ]

    # test image base64
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    # Prefixes of the asset list with 2 .. len - 1 entries.
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Send several images in one request, at and above the image limit.

    With at most ``MAXIMUM_IMAGES`` images, the request must return non-empty
    output text. With more, the request must raise ``openai.BadRequestError``,
    and a follow-up text-only request must still return non-empty output
    text, showing the server keeps working after the rejection.

    ``image_urls`` is parametrized with ``indirect=True``, so each list of
    asset names goes through an ``image_urls`` fixture defined outside this
    file.
    """
    messages = [
        {
            "role": "user",
            "content": [
                *(
                    {
                        "type": "input_image",
                        "image_url": image_url,
                        "detail": "auto",
                    }
                    for image_url in image_urls
                ),
                {"type": "input_text", "text": "What's in this image?"},
            ],
        }
    ]

    if len(image_urls) > MAXIMUM_IMAGES:
        with pytest.raises(openai.BadRequestError):  # test multi-image input
            await client.responses.create(
                model=model_name,
                input=messages,
            )
        # the server should still work afterwards: swap in a text-only prompt
        # for the final request below
        messages = [
            {
                "role": "user",
                "content": "What's the weather like in Paris today?",
            }
        ]

    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0