test_pixtral.py 7.61 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
import json
from dataclasses import asdict
4
from typing import TYPE_CHECKING, Any, Optional
5

Patrick von Platen's avatar
Patrick von Platen committed
6
import pytest
7
from mistral_common.multimodal import download_image
8
9
10
11
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
12
from transformers import AutoProcessor
13

14
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
15
from vllm.multimodal import MultiModalDataBuiltins
16
from vllm.multimodal.inputs import PlaceholderRange
17
from vllm.sequence import Logprob, SampleLogprobs
Patrick von Platen's avatar
Patrick von Platen committed
18

19
from ....utils import VLLM_PATH, large_gpu_test
20
from ...utils import check_logprobs_close
Patrick von Platen's avatar
Patrick von Platen committed
21

22
23
if TYPE_CHECKING:
    from _typeshed import StrPath
Patrick von Platen's avatar
Patrick von Platen committed
24

25
26
27
28
29
# Identifiers of the model checkpoints exercised by `test_chat`.
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

# `test_chat` is parametrized over every entry in this list.
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]

30
31
32
33
34
35
36
37
38
# Remote images attached to the test prompts.
# NOTE(review): the trailing path segments look like width/height, which
# would give each image a different size — confirm against the image host.
IMG_URLS = [
    "https://picsum.photos/id/237/400/300",
    "https://picsum.photos/id/231/200/300",
    "https://picsum.photos/id/27/500/500",
    "https://picsum.photos/id/17/150/600",
]
# Text instruction placed ahead of the images in every prompt built below.
PROMPT = "Describe each image in one short sentence."


39
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a single-turn user conversation that references images by URL.

    The result is the message list handed to `ChatCompletionRequest` and to
    the model's `chat` entry point elsewhere in this module.

    Args:
        urls: Image URLs to attach, in order, after the text prompt.

    Returns:
        A one-element list holding a ``"user"`` message whose content is the
        text chunk for `PROMPT` followed by one ``"image_url"`` chunk per URL.
    """
    text_chunk: dict[str, Any] = {"type": "text", "text": PROMPT}
    image_chunks: list[dict[str, Any]] = [{
        "type": "image_url",
        "image_url": {
            "url": url
        },
    } for url in urls]

    return [{
        "role": "user",
        "content": [text_chunk, *image_chunks],
    }]


55
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build a single-turn user conversation with the images embedded inline.

    Unlike `_create_msg_format`, every URL is resolved immediately with
    `download_image` and the resulting object is stored in the message, and
    the text chunk carries its payload under the ``"content"`` key.

    Args:
        urls: Image URLs to download and attach, in order, after the text.

    Returns:
        A one-element list holding a ``"user"`` message whose content is the
        text chunk for `PROMPT` followed by one ``"image"`` chunk per URL.
    """
    text_chunk: dict[str, Any] = {"type": "text", "content": PROMPT}
    image_chunks: list[dict[str, Any]] = [{
        "type": "image",
        "image": download_image(url),
    } for url in urls]

    return [{
        "role": "user",
        "content": [text_chunk, *image_chunks],
    }]


69
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Build a pre-tokenized prompt with its images attached.

    The conversation from `_create_msg_format` is encoded with the
    ``"pixtral"`` `MistralTokenizer`; the images are recovered from the
    request's `ImageURLChunk` entries via `image_from_chunk`.

    Args:
        urls: Image URLs to include in the prompt.

    Returns:
        A `TokensPrompt` carrying the encoded token ids and the images under
        ``multi_modal_data``.
    """
    msg = _create_msg_format(urls)

    tokenizer = MistralTokenizer.from_model("pixtral")

    request = ChatCompletionRequest(messages=msg)  # type: ignore[type-var]
    tokenized = tokenizer.encode_chat_completion(request)

    # Only the image chunks of the (single) user message carry image data.
    images = [
        image_from_chunk(chunk) for chunk in request.messages[0].content
        if isinstance(chunk, ImageURLChunk)
    ]

    return TokensPrompt(
        prompt_token_ids=tokenized.tokens,
        multi_modal_data=MultiModalDataBuiltins(image=images),
    )


90
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Build a text prompt with its images attached.

    The conversation from `_create_msg_format_hf` is rendered to a prompt
    with the chat template of the ``mistral-community/pixtral-12b``
    processor; the already-downloaded images are taken from the message.

    Args:
        urls: Image URLs to include in the prompt.

    Returns:
        A `TextPrompt` carrying the rendered prompt and the images under
        ``multi_modal_data``.
    """
    msg = _create_msg_format_hf(urls)

    processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    prompt = processor.apply_chat_template(msg)

    # The message chunks already hold the downloaded image objects.
    images = [
        chunk["image"] for chunk in msg[0]["content"]
        if chunk["type"] == "image"
    ]

    return TextPrompt(
        prompt=prompt,
        multi_modal_data=MultiModalDataBuiltins(image=images),
    )


107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Conversations with one, two, and all of the test images.
MSGS = [
    _create_msg_format(IMG_URLS[:num_images])
    for num_images in (1, 2, len(IMG_URLS))
]
# The same three image sets as pre-tokenized engine inputs.
ENGINE_INPUTS = [
    _create_engine_inputs(IMG_URLS[:num_images])
    for num_images in (1, 2, len(IMG_URLS))
]

# temperature=0.0 together with logprobs=5 lets the generated logprobs be
# compared against the stored reference outputs.
SAMPLING_PARAMS = SamplingParams(
    max_tokens=512,
    temperature=0.0,
    logprobs=5,
)
# Passed as `limit_mm_per_prompt` to the runner in both tests.
LIMIT_MM_PER_PROMPT = {"image": 4}

# Context lengths over which `test_chat` is parametrized.
MAX_MODEL_LEN = [8192, 65536]
122
123
124
125

# Directory containing the stored reference outputs; its existence is
# checked as soon as this module is imported.
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()

126
127
128
129
# Maps each model ID to the JSON fixture that `test_chat` loads as the
# expected outputs for that model.
FIXTURE_LOGPROBS_CHAT = {
    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
130

131
# One `(tokens, text, logprobs)` tuple per generated output, as written and
# read by the fixture helpers below.
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
132

133
134

# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Write generated outputs and their logprobs to a JSON file.

    Args:
        outputs: `(tokens, text, logprobs)` tuples to store. A missing
            (``None``) `logprobs` entry is written as an empty list.
        filename: Destination path; the file is overwritten.
    """
    json_data = []
    for tokens, text, logprobs in outputs:
        # Each logprob object is flattened to a plain dict via `asdict`
        # so that `json.dump` can serialize it.
        serialized_logprobs = [{
            token_id: asdict(logprob)
            for token_id, logprob in token_logprobs.items()
        } for token_logprobs in (logprobs or [])]
        json_data.append((tokens, text, serialized_logprobs))

    with open(filename, "w") as f:
        json.dump(json_data, f)


149
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Load outputs and logprobs from a JSON fixture file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        One `(tokens, text, logprobs)` tuple per stored output, where each
        per-token mapping has integer keys and `Logprob` values.
    """
    with open(filename, "rb") as f:
        json_data = json.load(f)

    outputs: OutputsLogprobs = []
    for tokens, text, logprobs in json_data:
        # JSON object keys are always strings, so the token ids are
        # converted back to `int` while rebuilding the `Logprob` objects.
        restored_logprobs = [{
            int(token_id): Logprob(**fields)
            for token_id, fields in token_logprobs.items()
        } for token_logprobs in logprobs]
        outputs.append((tokens, text, restored_logprobs))

    return outputs
Patrick von Platen's avatar
Patrick von Platen committed
157
158


159
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
    vllm_runner,
    max_model_len: int,
    model: str,
    dtype: str,
) -> None:
    """Compare chat outputs for every conversation in `MSGS` with the fixture.

    The model is loaded with the "mistral" tokenizer mode, load format and
    config format. Each conversation is run through `chat` with
    `SAMPLING_PARAMS`, and the resulting logprobs are checked against the
    stored reference for `model` using `check_logprobs_close`.
    """
    expected_chat_logprobs = load_outputs_w_logprobs(
        FIXTURE_LOGPROBS_CHAT[model])

    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
            load_format="mistral",
            config_format="mistral",
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = []
        for msg in MSGS:
            outputs.extend(
                vllm_model.model.chat(msg, sampling_params=SAMPLING_PARAMS))

    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)

    # Remove last `None` prompt_logprobs to compare with fixture
    trimmed_logprobs = []
    for entry in logprobs:
        assert entry[-1] is None
        trimmed_logprobs.append(entry[:-1])

    check_logprobs_close(outputs_0_lst=expected_chat_logprobs,
                         outputs_1_lst=trimmed_logprobs,
                         name_0="h100_ref",
                         name_1="output")
196
197


198
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize(
    "prompt,expected_ranges",
    [
        (
            _create_engine_inputs_hf(IMG_URLS[:1]),
            [PlaceholderRange(offset=11, length=494)],
        ),
        (
            _create_engine_inputs_hf(IMG_URLS[1:4]),
            [
                PlaceholderRange(offset=11, length=266),
                PlaceholderRange(offset=277, length=1056),
                PlaceholderRange(offset=1333, length=418),
            ],
        ),
    ],
)
def test_multi_modal_placeholders(vllm_runner, prompt,
                                  expected_ranges: list[PlaceholderRange],
                                  monkeypatch) -> None:
    """Check the image placeholder ranges reported for a generated prompt.

    A single request is generated for `prompt`, and the ``"image"`` entries
    of the output's `multi_modal_placeholders` must match `expected_ranges`
    in both count and value, in order.
    """
    # This placeholder checking test only works with V0 engine
    # where `multi_modal_placeholders` is returned with `RequestOutput`
    monkeypatch.setenv("VLLM_USE_V1", "0")

    with vllm_runner(
            "mistral-community/pixtral-12b",
            max_model_len=8192,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = vllm_model.model.generate(prompt)

        # Exactly one prompt was submitted, so exactly one output is expected.
        assert len(outputs) == 1, f"{len(outputs)=}"
        output: RequestOutput = outputs[0]

        assert hasattr(output,
                       "multi_modal_placeholders"), f"{output.__dict__=}"
        assert "image" in output.multi_modal_placeholders, \
            f"{output.multi_modal_placeholders.keys()=}"

        image_placeholder_ranges: list[
            PlaceholderRange] = output.multi_modal_placeholders["image"]
        # The count is checked first because `zip` below would silently
        # stop at the shorter sequence.
        assert len(image_placeholder_ranges) == len(
            expected_ranges), f"{image_placeholder_ranges=}"

        for real_range, expected_range in zip(image_placeholder_ranges,
                                              expected_ranges):
            assert real_range == expected_range, \
                f"{real_range=} {expected_range=}"