test_pixtral.py 8.08 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
import json
from dataclasses import asdict
5
from typing import TYPE_CHECKING, Any, Optional
6

Patrick von Platen's avatar
Patrick von Platen committed
7
import pytest
8
from mistral_common.multimodal import download_image
9
10
11
12
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
13
from transformers import AutoProcessor
14

15
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
16
from vllm.multimodal import MultiModalDataBuiltins
17
from vllm.multimodal.inputs import PlaceholderRange
18
from vllm.sequence import Logprob, SampleLogprobs
Patrick von Platen's avatar
Patrick von Platen committed
19

20
from ....utils import VLLM_PATH, large_gpu_test
21
from ...utils import check_logprobs_close, dummy_hf_overrides
Patrick von Platen's avatar
Patrick von Platen committed
22

23
24
if TYPE_CHECKING:
    from _typeshed import StrPath
Patrick von Platen's avatar
Patrick von Platen committed
25

26
27
28
29
30
# Model identifiers under test; `test_chat` is parametrized over `MODELS` and
# `FIXTURE_LOGPROBS_CHAT` is keyed by the same identifiers.
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]

31
# Image file names; the tests turn each one into a URL through
# `local_asset_server.url_for(...)` before using it.
#
# The first file was originally referenced as
# https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg
# NOTE(review): the previous per-entry comments repeated that same URL for
# every file; the other images presumably live next to it under their own
# file names -- confirm before relying on it.
IMG_URLS = [
    "237-400x300.jpg",
    "231-200x300.jpg",
    "27-500x500.jpg",
    "17-150x600.jpg",
]

# Text part of the user message sent together with the images.
PROMPT = "Describe each image in one short sentence."


40
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message chat conversation with `PROMPT` and the images.

    Args:
        urls: Image URLs; each becomes one `image_url` content part, in
            the given order, after the text part.

    Returns:
        A list holding a single `user` message whose `content` is the text
        part followed by the image parts.
    """
    text_part = {"type": "text", "text": PROMPT}
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": link
        },
    } for link in urls]

    user_message = {"role": "user", "content": [text_part, *image_parts]}
    return [user_message]


56
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build the one-message conversation consumed by `_create_engine_inputs_hf`.

    Unlike `_create_msg_format`, the text part stores `PROMPT` under the
    `content` key and every image part carries the already downloaded image
    (the result of `download_image(url)`) instead of its URL.

    Args:
        urls: Image URLs, downloaded in the given order.

    Returns:
        A list holding a single `user` message whose `content` is the text
        part followed by one `image` part per URL.
    """
    content_parts: list[dict[str, Any]] = [{"type": "text", "content": PROMPT}]
    for link in urls:
        content_parts.append({"type": "image", "image": download_image(link)})

    return [{"role": "user", "content": content_parts}]


70
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Tokenize the chat request with the Mistral tokenizer and attach images.

    Args:
        urls: Image URLs placed into the request via `_create_msg_format`.

    Returns:
        A `TokensPrompt` whose `prompt_token_ids` come from
        `encode_chat_completion` and whose `multi_modal_data` holds the
        images extracted from the request's `ImageURLChunk` parts.
    """
    conversation = _create_msg_format(urls)
    mistral_tokenizer = MistralTokenizer.from_model("pixtral")

    request = ChatCompletionRequest(
        messages=conversation)  # type: ignore[type-var]
    encoded = mistral_tokenizer.encode_chat_completion(request)

    engine_inputs = TokensPrompt(prompt_token_ids=encoded.tokens)

    # Only the image chunks of the (single) user message carry image data.
    images = [
        image_from_chunk(part) for part in request.messages[0].content
        if isinstance(part, ImageURLChunk)
    ]
    engine_inputs["multi_modal_data"] = MultiModalDataBuiltins(image=images)

    return engine_inputs


91
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Render the prompt text with the HF processor and attach the images.

    Args:
        urls: Image URLs, downloaded by `_create_msg_format_hf`.

    Returns:
        A `TextPrompt` whose `prompt` is the output of the processor's
        `apply_chat_template` and whose `multi_modal_data` holds the images
        taken from the `image` parts of the message.
    """
    conversation = _create_msg_format_hf(urls)

    processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    rendered_prompt = processor.apply_chat_template(conversation)

    # Collect the downloaded images in the order they appear in the message.
    images = [
        part["image"] for part in conversation[0]["content"]
        if part["type"] == "image"
    ]

    return TextPrompt(
        prompt=rendered_prompt,
        multi_modal_data=MultiModalDataBuiltins(image=images),
    )


108
109
110
111
# Sampling settings passed to `llm.chat` in `test_chat`.
SAMPLING_PARAMS = SamplingParams(
    max_tokens=512,
    temperature=0.0,
    logprobs=5,
)
# Forwarded to the runner as `limit_mm_per_prompt` in both tests.
LIMIT_MM_PER_PROMPT = {"image": 4}

# Values `test_chat` is parametrized over for `max_model_len`.
MAX_MODEL_LEN = [8192, 65536]
112
113
114
115

# Directory with the stored reference outputs; fail at import time when it is
# missing.
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()

# Reference chat outputs, one JSON fixture per model identifier.
FIXTURE_LOGPROBS_CHAT = {
    model_id: FIXTURES_PATH / fixture_name
    for model_id, fixture_name in (
        (PIXTRAL_ID, "pixtral_chat.json"),
        (MISTRAL_SMALL_3_1_ID, "mistral_small_3_chat.json"),
    )
}
120

121
# One tuple per output, unpacked elsewhere in this file as
# (tokens, text, logprobs); the logprobs element may be None.
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
122

123
124

# For the test author to store golden output in JSON
125
126
127
128
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Write outputs with their logprobs to `filename` as JSON.

    Helper for the test author to store golden output. Every logprob value is
    converted with `dataclasses.asdict`; an output whose logprobs are `None`
    (or empty) is stored with an empty list.

    Args:
        outputs: `(tokens, text, logprobs)` tuples to serialize.
        filename: Destination path; the file is opened in text write mode.
    """
    serializable = []
    for token_ids, generated_text, sample_logprobs in outputs:
        per_position = [{
            token_id: asdict(entry)
            for token_id, entry in position.items()
        } for position in (sample_logprobs or [])]
        serializable.append((token_ids, generated_text, per_position))

    with open(filename, "w") as fp:
        json.dump(serializable, fp)


139
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Read outputs with logprobs back from a JSON file.

    Each stored key is converted back to `int` and each stored value dict is
    expanded into a `Logprob`.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        One `(tokens, text, logprobs)` tuple per stored entry.
    """
    with open(filename, "rb") as fp:
        raw_entries = json.load(fp)

    restored: OutputsLogprobs = []
    for token_ids, generated_text, raw_logprobs in raw_entries:
        per_position = [{
            int(token_id): Logprob(**fields)
            for token_id, fields in position.items()
        } for position in raw_logprobs]
        restored.append((token_ids, generated_text, per_position))

    return restored
Patrick von Platen's avatar
Patrick von Platen committed
147
148


149
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
              local_asset_server) -> None:
    """Check chat logprobs against the stored reference for `model`.

    Three chat requests are issued, using the first one, the first two and
    all of the `IMG_URLS` images. The resulting logprobs are compared with
    the fixture listed for `model` in `FIXTURE_LOGPROBS_CHAT` through
    `check_logprobs_close`.
    """
    reference_logprobs = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])

    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
            load_format="mistral",
            config_format="mistral",
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = []

        served_urls = [local_asset_server.url_for(name) for name in IMG_URLS]
        conversations = [
            _create_msg_format(served_urls[:image_count])
            for image_count in (1, 2, len(served_urls))
        ]
        for conversation in conversations:
            outputs.extend(
                vllm_model.llm.chat(conversation,
                                    sampling_params=SAMPLING_PARAMS))

    generated = vllm_runner._final_steps_generate_w_logprobs(outputs)

    # Remove last `None` prompt_logprobs to compare with fixture
    comparable = []
    for entry in generated:
        assert entry[-1] is None
        comparable.append(entry[:-1])

    check_logprobs_close(outputs_0_lst=reference_logprobs,
                         outputs_1_lst=comparable,
                         name_0="h100_ref",
                         name_1="output")
188
189


190
@pytest.mark.parametrize(
    "image_urls,expected_ranges",
    [
        (IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
        (
            IMG_URLS[1:4],
            [
                PlaceholderRange(offset=11, length=266),
                PlaceholderRange(offset=277, length=1056),
                PlaceholderRange(offset=1333, length=418),
            ],
        ),
    ],
)
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
                                  expected_ranges: list[PlaceholderRange],
                                  local_asset_server, monkeypatch) -> None:
    """Check the image placeholder ranges reported for a generated output.

    The prompt is built with `_create_engine_inputs_hf`, and the model runs
    with `load_format="dummy"`. The offset and length of every returned image
    `PlaceholderRange` must equal the parametrized expectation.
    """
    served_urls = [local_asset_server.url_for(name) for name in image_urls]
    engine_prompt = _create_engine_inputs_hf(served_urls)

    # This placeholder checking test only works with V0 engine
    # where `multi_modal_placeholders` is returned with `RequestOutput`
    monkeypatch.setenv("VLLM_USE_V1", "0")

    with vllm_runner(
            "mistral-community/pixtral-12b",
            max_model_len=8192,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
            load_format="dummy",
            hf_overrides=dummy_hf_overrides,
    ) as vllm_model:
        outputs = vllm_model.llm.generate(engine_prompt)

        assert len(outputs) == 1, f"{len(outputs)=}"
        output: RequestOutput = outputs[0]

        assert hasattr(
            output, "multi_modal_placeholders"), (f"{output.__dict__=}")
        assert "image" in output.multi_modal_placeholders, (
            f"{output.multi_modal_placeholders.keys()=}")

        image_placeholder_ranges: list[PlaceholderRange] = (
            output.multi_modal_placeholders["image"])
        assert len(image_placeholder_ranges) == len(expected_ranges), (
            f"{image_placeholder_ranges=}")

        # Lengths were checked above, so zip pairs every range exactly once.
        for real_range, expected_range in zip(image_placeholder_ranges,
                                              expected_ranges):
            assert real_range.offset == expected_range.offset, (
                f"{real_range=} {expected_range=}")
            assert real_range.length == expected_range.length, (
                f"{real_range=} {expected_range=}")