# test_image.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
11
from vllm.multimodal.utils import encode_image_base64
12
13
14
15
16

# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
# Image limit handed to the server through --limit-mm-per-prompt;
# test_multi_image_input expects requests with more images to be rejected.
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each entry is an asset file name; the comment above it records the URL
# noted for that asset.
TEST_IMAGE_ASSETS = [
    # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
    "Grayscale_8bits_palette_sample_image.png",
    # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png"
    "1280px-Venn_diagram_rgb.svg.png",
    # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
    "RGBA_comp.png",
]


@pytest.fixture(scope="module")
def default_image_server_args():
    """Return the command-line argument list used to launch the test server.

    The per-prompt multimodal limit is serialized as JSON from
    ``MAXIMUM_IMAGES`` so the server and the tests share one value.
    """
    mm_limit_json = json.dumps({"image": MAXIMUM_IMAGES})

    cli_args = ["--enforce-eager"]
    cli_args += ["--max-model-len", "6000"]
    cli_args += ["--max-num-seqs", "128"]
    cli_args += ["--limit-mm-per-prompt", mm_limit_json]
    return cli_args


@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    """Yield a module-scoped ``RemoteOpenAIServer`` for ``MODEL_NAME``.

    The server is created with ``default_image_server_args`` and an
    ``env_dict`` that sets ``VLLM_ENABLE_RESPONSES_API_STORE`` to ``"1"``.
    The ``with`` block is exited once the module's tests have finished, so
    the server's context manager handles teardown.
    """
    with RemoteOpenAIServer(
            MODEL_NAME,
            default_image_server_args,
            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
    ) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(image_server):
    """Yield an async client obtained from ``image_server``.

    The client is used as an async context manager, so it is exited when
    the requesting test finishes.
    """
    client_context = image_server.get_async_client()
    async with client_context as openai_client:
        yield openai_client


@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    """Map every name in ``TEST_IMAGE_ASSETS`` to its base64-encoded image.

    Each asset is loaded with ``local_asset_server.get_image_asset`` and
    passed to ``encode_image_base64``. The fixture is session-scoped, so the
    mapping is built once per test session.
    """
    return {
        asset_name:
        encode_image_base64(local_asset_server.get_image_asset(asset_name))
        for asset_name in TEST_IMAGE_ASSETS
    }


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
# indirect=True: each asset name is handed to the ``image_url`` fixture
# (defined outside this file), and the test receives that fixture's value.
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    """Send one image and a text question; expect non-empty output text.

    Args:
        client: Async OpenAI client from the ``client`` fixture.
        model_name: Model to request.
        image_url: Value supplied by the ``image_url`` fixture for one
            entry of ``TEST_IMAGE_ASSETS``.
    """
    content_text = "What's in this image?"
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "input_image",
                "image_url": image_url,
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": content_text
            },
        ],
    }]

    # test image url
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Send one image as a base64 data URL; expect non-empty output text.

    Args:
        client: Async OpenAI client from the ``client`` fixture.
        model_name: Model to request.
        raw_image_url: One entry of ``TEST_IMAGE_ASSETS``; used as the key
            into ``base64_encoded_image``.
        base64_encoded_image: Mapping from asset name to base64 payload.
    """
    content_text = "What's in this image?"
    # NOTE(review): the MIME type is image/jpeg for every asset, PNG ones
    # included - presumably encode_image_base64 re-encodes to JPEG; confirm.
    data_url = f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "input_image",
                "image_url": data_url,
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": content_text
            },
        ],
    }]
    # test image base64
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
# Prefixes of TEST_IMAGE_ASSETS with 2 .. len-1 entries. indirect=True hands
# each list to the ``image_urls`` fixture (defined outside this file).
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True)
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):
    """Check multi-image requests against the ``MAXIMUM_IMAGES`` limit.

    With more than ``MAXIMUM_IMAGES`` images the request must raise
    ``openai.BadRequestError``, and a follow-up text-only request must still
    return output. Otherwise the multi-image request itself must return
    non-empty output text.

    Args:
        client: Async OpenAI client from the ``client`` fixture.
        model_name: Model to request.
        image_urls: Values supplied by the ``image_urls`` fixture.
    """
    image_parts = [{
        "type": "input_image",
        "image_url": image_url,
        "detail": "auto",
    } for image_url in image_urls]
    messages = [{
        "role":
        "user",
        "content": [
            *image_parts,
            {
                "type": "input_text",
                "text": "What's in this image?"
            },
        ],
    }]

    if len(image_urls) > MAXIMUM_IMAGES:
        with pytest.raises(openai.BadRequestError):  # test multi-image input
            await client.responses.create(
                model=model_name,
                input=messages,
            )
        # the server should still work afterwards
        response = await client.responses.create(
            model=model_name,
            input=[{
                "role": "user",
                "content": "What's the weather like in Paris today?",
            }],
        )
        assert len(response.output_text) > 0
    else:
        response = await client.responses.create(
            model=model_name,
            input=messages,
        )
        assert len(response.output_text) > 0