"vllm/vscode:/vscode.git/clone" did not exist on "9ab4388cd3dc6bb2069125c1a84feaa1aa193e0b"
test_render_multimodal.py 4.39 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""Multimodal tests for the /render endpoints that expose prompt preprocessing."""

import httpx
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_url

# Model passed to RemoteOpenAIServer by the `vision_server` fixture and sent as
# the "model" field of every request body in this module.
VISION_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"


@pytest.fixture(scope="module")
def vision_server():
    """Provide one vision-capable server for the module's multimodal /render tests.

    The fixture is module-scoped, so a single ``RemoteOpenAIServer`` context is
    entered once and shared by every test in this file. The server context is
    exited when the fixture is torn down.
    """

    # Value-taking flags, kept as pairs so each flag sits next to its value.
    flag_values = (
        ("--max-model-len", "100"),
        ("--max-num-seqs", "1"),
        ("--limit-mm-per-prompt.image", "1"),
        ("--limit-mm-per-prompt.video", "0"),
    )

    # "--enforce-eager" is a bare switch; the pairs above are flattened after it.
    server_args = ["--enforce-eager"]
    for flag, value in flag_values:
        server_args += [flag, value]

    # No environment overrides are needed; an empty mapping is passed explicitly.
    extra_env: dict[str, str] = {}

    with RemoteOpenAIServer(
        VISION_MODEL_NAME, server_args, env_dict=extra_env
    ) as server:
        yield server


@pytest_asyncio.fixture
async def vision_client(vision_server):
    """Yield an ``httpx.AsyncClient`` rooted at the ``vision_server`` base URL.

    The client uses a 60.0 timeout and is closed when the ``async with``
    block is left at fixture teardown.
    """
    root_url = vision_server.url_for("")
    async with httpx.AsyncClient(base_url=root_url, timeout=60.0) as client:
        yield client


@pytest.mark.asyncio
async def test_chat_completion_render_with_base64_image_url(
    vision_client,
    local_asset_server,
):
    """Rendering a chat request with an inline image returns tokens and features."""

    asset = local_asset_server.get_image_asset("RGBA_comp.png")
    data_url = encode_image_url(asset, format="PNG")

    # Sanity-check the encoded URL before it is sent to the server.
    assert data_url.startswith("data:image/")
    assert ";base64," in data_url

    user_message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": data_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    payload = {"model": VISION_MODEL_NAME, "messages": [user_message]}

    response = await vision_client.post("/v1/chat/completions/render", json=payload)
    assert response.status_code == 200

    # The body must be a JSON object carrying a non-empty token list.
    body = response.json()
    assert isinstance(body, dict)
    assert "token_ids" in body
    token_ids = body["token_ids"]
    assert isinstance(token_ids, list)
    assert token_ids

    # Multimodal features must be present and non-null.
    assert "features" in body
    features = body["features"]
    assert features is not None

    # mm_hashes["image"]: a non-empty list of strings.
    assert "mm_hashes" in features
    assert "image" in features["mm_hashes"]
    hashes = features["mm_hashes"]["image"]
    assert isinstance(hashes, list)
    assert hashes
    for digest in hashes:
        assert isinstance(digest, str)

    # mm_placeholders["image"]: a non-empty list of entries, each with an
    # integer "offset" and a positive integer "length".
    assert "mm_placeholders" in features
    assert "image" in features["mm_placeholders"]
    placeholders = features["mm_placeholders"]["image"]
    assert isinstance(placeholders, list)
    assert placeholders
    for placeholder in placeholders:
        assert all(key in placeholder for key in ("offset", "length"))
        offset, length = placeholder["offset"], placeholder["length"]
        assert isinstance(offset, int)
        assert isinstance(length, int)
        assert length > 0


@pytest.mark.asyncio
async def test_tokenize_matches_render_for_multimodal_input(
    vision_client,
    local_asset_server,
):
    """Token output of `/tokenize` must agree with `/v1/chat/completions/render`."""

    asset = local_asset_server.get_image_asset("RGBA_comp.png")
    data_url = encode_image_url(asset, format="PNG")

    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    text_part = {"type": "text", "text": "What's in this image?"}

    # The same request body is posted to both endpoints.
    payload = {
        "model": VISION_MODEL_NAME,
        "messages": [{"role": "user", "content": [image_part, text_part]}],
    }

    rendered = await vision_client.post("/v1/chat/completions/render", json=payload)
    assert rendered.status_code == 200
    render_body = rendered.json()

    tokenized = await vision_client.post("/tokenize", json=payload)
    assert tokenized.status_code == 200
    tokenize_body = tokenized.json()

    # Both the token list and the reported count must line up with /render.
    rendered_ids = render_body["token_ids"]
    assert tokenize_body["tokens"] == rendered_ids
    assert tokenize_body["count"] == len(rendered_ids)