test_pixtral.py 7.86 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
import json
from dataclasses import asdict
5
from typing import TYPE_CHECKING, Any, Optional
6

7
import os
Patrick von Platen's avatar
Patrick von Platen committed
8
import pytest
9
from mistral_common.multimodal import download_image
10
11
12
13
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
14
from transformers import AutoProcessor
15

16
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
17
from vllm.multimodal import MultiModalDataBuiltins
18
from vllm.multimodal.inputs import PlaceholderRange
19
from vllm.sequence import Logprob, SampleLogprobs
Patrick von Platen's avatar
Patrick von Platen committed
20

21
from ....utils import VLLM_PATH, large_gpu_test
22
from ...utils import check_logprobs_close
23
from ....utils import models_path_prefix
Patrick von Platen's avatar
Patrick von Platen committed
24

25
26
if TYPE_CHECKING:
    from _typeshed import StrPath
Patrick von Platen's avatar
Patrick von Platen committed
27

28
29
30
# Identifiers of the models exercised by `test_chat`.
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

# Model paths handed to the runner: each identifier joined onto
# `models_path_prefix`.
MODELS = [
    os.path.join(models_path_prefix, model_id)
    for model_id in (PIXTRAL_ID, MISTRAL_SMALL_3_1_ID)
]
32

33
34
35
36
37
38
39
40
41
# Image URLs used to build the multimodal prompts in this module; slices of
# this list select how many images go into each prompt.
IMG_URLS = [
    "https://picsum.photos/id/237/400/300",
    "https://picsum.photos/id/231/200/300",
    "https://picsum.photos/id/27/500/500",
    "https://picsum.photos/id/17/150/600",
]
# Text instruction placed ahead of the image chunks in every user message.
PROMPT = "Describe each image in one short sentence."


42
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message chat conversation with PROMPT and image URLs.

    The single ``user`` message carries a text chunk holding ``PROMPT``
    followed by one ``image_url`` chunk per entry of ``urls``, in order.
    """
    text_chunk: dict[str, Any] = {"type": "text", "text": PROMPT}
    image_chunks: list[dict[str, Any]] = [
        {"type": "image_url", "image_url": {"url": url}} for url in urls
    ]
    user_message: dict[str, Any] = {
        "role": "user",
        "content": [text_chunk, *image_chunks],
    }
    return [user_message]


58
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message conversation whose image chunks hold the images.

    Unlike ``_create_msg_format``, every URL is passed through
    ``download_image`` right away and the result is stored under the
    ``"image"`` key. The text chunk stores ``PROMPT`` under ``"content"``.
    """
    chunks: list[dict[str, Any]] = [{"type": "text", "content": PROMPT}]
    for url in urls:
        chunks.append({"type": "image", "image": download_image(url)})
    return [{"role": "user", "content": chunks}]


72
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Tokenize a PROMPT-plus-images request into a ``TokensPrompt``.

    The chat request is encoded with the ``"pixtral"`` ``MistralTokenizer``;
    the images attached as multimodal data are obtained by running
    ``image_from_chunk`` on every ``ImageURLChunk`` of the first message.
    """
    msgs = _create_msg_format(urls)
    mistral_tokenizer = MistralTokenizer.from_model("pixtral")

    request = ChatCompletionRequest(messages=msgs)  # type: ignore[type-var]
    encoded = mistral_tokenizer.encode_chat_completion(request)

    # Only the image chunks contribute multimodal data; text chunks are
    # skipped.
    images = [
        image_from_chunk(chunk) for chunk in request.messages[0].content
        if isinstance(chunk, ImageURLChunk)
    ]

    return TokensPrompt(
        prompt_token_ids=encoded.tokens,
        multi_modal_data=MultiModalDataBuiltins(image=images),
    )


93
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Build a ``TextPrompt`` using the HF processor's chat template.

    The conversation from ``_create_msg_format_hf`` is passed to
    ``apply_chat_template`` of the ``mistral-community/pixtral-12b``
    processor (resolved under ``models_path_prefix``); the images already
    stored in the message's image chunks become the multimodal data.
    """
    messages = _create_msg_format_hf(urls)

    processor_path = os.path.join(models_path_prefix,
                                  "mistral-community/pixtral-12b")
    processor = AutoProcessor.from_pretrained(processor_path)
    rendered_prompt = processor.apply_chat_template(messages)

    images = [
        chunk["image"] for chunk in messages[0]["content"]
        if chunk["type"] == "image"
    ]

    return TextPrompt(
        prompt=rendered_prompt,
        multi_modal_data=MultiModalDataBuiltins(image=images),
    )


110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Conversations and engine inputs built from the first one, the first two,
# and all of the image URLs.
MSGS = [
    _create_msg_format(IMG_URLS[:num_images])
    for num_images in (1, 2, len(IMG_URLS))
]
ENGINE_INPUTS = [
    _create_engine_inputs(IMG_URLS[:num_images])
    for num_images in (1, 2, len(IMG_URLS))
]

# Sampling configuration and per-prompt image limit shared by the tests.
SAMPLING_PARAMS = SamplingParams(
    max_tokens=512,
    temperature=0.0,
    logprobs=5,
)
LIMIT_MM_PER_PROMPT = {"image": 4}

# Context lengths `test_chat` is parametrized over.
MAX_MODEL_LEN = [8192, 65536]
125
126
127
128

# Directory holding the golden-output JSON fixtures; importing this module
# fails right away if it is missing.
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()

# Golden chat logprobs per model.
#
# `test_chat` looks entries up with the values of MODELS, which are the model
# identifiers joined onto `models_path_prefix`. Keying this mapping by the
# bare identifiers alone therefore raises KeyError whenever the prefix is
# non-empty, so every fixture is registered under both the bare identifier
# and the prefixed path (the two coincide when the prefix is empty).
FIXTURE_LOGPROBS_CHAT = {
    key: FIXTURES_PATH / fixture_name
    for model_id, fixture_name in (
        (PIXTRAL_ID, "pixtral_chat.json"),
        (MISTRAL_SMALL_3_1_ID, "mistral_small_3_chat.json"),
    )
    for key in (model_id, os.path.join(models_path_prefix, model_id))
}
133

134
# One tuple per output; the dump/load helpers in this module unpack each
# tuple as (tokens, text, logprobs), where logprobs may be None.
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
135

136
137

# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Write ``outputs`` to ``filename`` as JSON.

    Each ``(tokens, text, logprobs)`` entry is stored with its logprob
    objects converted to plain dicts via ``dataclasses.asdict``. A missing
    (``None``) logprobs value is stored as an empty list.
    """
    serializable = []
    for token_ids, generated_text, sample_logprobs in outputs:
        per_position = []
        for position_logprobs in sample_logprobs or []:
            per_position.append({
                token_id: asdict(entry)
                for token_id, entry in position_logprobs.items()
            })
        serializable.append((token_ids, generated_text, per_position))

    with open(filename, "w") as out_file:
        json.dump(serializable, out_file)


152
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Read outputs from the JSON file at ``filename``.

    JSON object keys are strings, so each logprob key is converted back
    with ``int`` and each value is rebuilt as ``Logprob(**value)``.
    """
    with open(filename, "rb") as fixture_file:
        records = json.load(fixture_file)

    loaded: OutputsLogprobs = []
    for token_ids, generated_text, raw_logprobs in records:
        sample_logprobs = []
        for position in raw_logprobs:
            sample_logprobs.append({
                int(token_id): Logprob(**fields)
                for token_id, fields in position.items()
            })
        loaded.append((token_ids, generated_text, sample_logprobs))
    return loaded
Patrick von Platen's avatar
Patrick von Platen committed
160
161


162
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
    vllm_runner,
    max_model_len: int,
    model: str,
    dtype: str,
) -> None:
    """Check chat logprobs for every conversation in MSGS against fixtures.

    The model is loaded in ``mistral`` tokenizer/load/config format, each
    conversation is run through ``chat`` with ``SAMPLING_PARAMS``, and the
    resulting logprobs are compared with the stored reference outputs using
    ``check_logprobs_close``.
    """
    expected_logprobs = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])

    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
            load_format="mistral",
            config_format="mistral",
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        request_outputs = []
        for conversation in MSGS:
            request_outputs += vllm_model.model.chat(
                conversation, sampling_params=SAMPLING_PARAMS)

    generated = vllm_runner._final_steps_generate_w_logprobs(request_outputs)

    # Remove last `None` prompt_logprobs to compare with fixture
    trimmed = []
    for entry in generated:
        assert entry[-1] is None
        trimmed.append(entry[:-1])

    check_logprobs_close(
        outputs_0_lst=expected_logprobs,
        outputs_1_lst=trimmed,
        name_0="h100_ref",
        name_1="output",
    )
199
200


201
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize(
    "prompt,expected_ranges",
    [
        (
            _create_engine_inputs_hf(IMG_URLS[:1]),
            [PlaceholderRange(offset=11, length=494)],
        ),
        (
            _create_engine_inputs_hf(IMG_URLS[1:4]),
            [
                PlaceholderRange(offset=11, length=266),
                PlaceholderRange(offset=277, length=1056),
                PlaceholderRange(offset=1333, length=418),
            ],
        ),
    ],
)
def test_multi_modal_placeholders(vllm_runner, prompt,
                                  expected_ranges: list[PlaceholderRange],
                                  monkeypatch) -> None:
    """Check the image placeholder ranges reported for a generated prompt.

    Generates from ``prompt`` with the ``mistral-community/pixtral-12b``
    model and asserts that the single ``RequestOutput`` exposes
    ``multi_modal_placeholders["image"]`` equal, element by element, to
    ``expected_ranges``.
    """
    # This placeholder checking test only works with V0 engine
    # where `multi_modal_placeholders` is returned with `RequestOutput`
    monkeypatch.setenv("VLLM_USE_V1", "0")

    hf_model_path = os.path.join(models_path_prefix,
                                 "mistral-community/pixtral-12b")
    with vllm_runner(
            hf_model_path,
            max_model_len=8192,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = vllm_model.model.generate(prompt)

        # Exactly one request was submitted, so exactly one output is
        # expected back.
        assert len(outputs) == 1, f"{len(outputs)=}"
        output: RequestOutput = outputs[0]

        has_placeholders = hasattr(output, "multi_modal_placeholders")
        assert has_placeholders, f"{output.__dict__=}"
        assert "image" in output.multi_modal_placeholders, \
            f"{output.multi_modal_placeholders.keys()=}"

        image_placeholder_ranges: list[PlaceholderRange] = (
            output.multi_modal_placeholders["image"])
        assert len(image_placeholder_ranges) == len(
            expected_ranges), f"{image_placeholder_ranges=}"

        range_pairs = zip(image_placeholder_ranges, expected_ranges)
        for real_range, expected_range in range_pairs:
            assert real_range == expected_range, \
                f"{real_range=} {expected_range=}"