test_pixtral.py 8.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
import json
from dataclasses import asdict
5
from typing import TYPE_CHECKING, Any, Optional
6

7
import os
Patrick von Platen's avatar
Patrick von Platen committed
8
import pytest
9
from mistral_common.multimodal import download_image
10
11
12
13
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
14
from transformers import AutoProcessor
15

16
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
17
from vllm.multimodal import MultiModalDataBuiltins
18
from vllm.multimodal.inputs import PlaceholderRange
19
from vllm.sequence import Logprob, SampleLogprobs
Patrick von Platen's avatar
Patrick von Platen committed
20

21
from ....utils import VLLM_PATH, large_gpu_test
22
from ...utils import check_logprobs_close, dummy_hf_overrides, models_path_prefix
Patrick von Platen's avatar
Patrick von Platen committed
23

24
25
if TYPE_CHECKING:
    from _typeshed import StrPath
Patrick von Platen's avatar
Patrick von Platen committed
26

27
28
# Model identifiers, each already prefixed with `models_path_prefix`.
PIXTRAL_ID = os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")
MISTRAL_SMALL_3_1_ID = os.path.join(
    models_path_prefix, "mistralai/Mistral-Small-3.1-24B-Instruct-2503")

# Use the IDs as-is. Joining `models_path_prefix` onto them a second time
# duplicates a relative prefix, and the resulting strings no longer match the
# keys of FIXTURE_LOGPROBS_CHAT that test_chat looks up with these values.
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
31

32
# Image file names; the tests turn each one into a URL through
# `local_asset_server.url_for(...)`. The trailing comment on each entry is the
# Hugging Face dataset URL for the file of the same name (presumably where the
# locally served copies originate).
IMG_URLS = [
    "237-400x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
    "231-200x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
    "27-500x500.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
    "17-150x600.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
]
# Text instruction placed ahead of the images in every generated message.
PROMPT = "Describe each image in one short sentence."


41
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message chat whose user turn carries PROMPT plus images.

    Args:
        urls: Image URLs; each becomes one ``image_url`` content part, in
            the order given.

    Returns:
        A list holding a single ``user`` message. Its content is the text
        part (``PROMPT``) followed by the image parts.
    """
    text_part = {"type": "text", "text": PROMPT}
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": link
        },
    } for link in urls]
    return [{"role": "user", "content": [text_part, *image_parts]}]


57
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build a one-message chat for the HF processor's chat template.

    Args:
        urls: Image URLs; each is passed to ``download_image`` and the
            returned object is embedded directly in the message.

    Returns:
        A list holding a single ``user`` message. Its content starts with a
        text part (stored under the ``"content"`` key) followed by one
        ``image`` part per URL, in order.
    """
    parts: list[dict[str, Any]] = [{"type": "text", "content": PROMPT}]
    for link in urls:
        parts.append({"type": "image", "image": download_image(link)})
    return [{"role": "user", "content": parts}]


71
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Create a token-ID prompt with attached images for the given URLs.

    The chat message from ``_create_msg_format`` is wrapped in a
    ``ChatCompletionRequest`` and encoded with the ``"pixtral"``
    ``MistralTokenizer``. Every ``ImageURLChunk`` in the first message is
    converted with ``image_from_chunk`` and passed along as image data.

    Args:
        urls: Image URLs to include in the prompt.

    Returns:
        A ``TokensPrompt`` with ``prompt_token_ids`` and ``multi_modal_data``.
    """
    messages = _create_msg_format(urls)

    mistral_tokenizer = MistralTokenizer.from_model("pixtral")

    chat_request = ChatCompletionRequest(
        messages=messages)  # type: ignore[type-var]
    token_ids = mistral_tokenizer.encode_chat_completion(chat_request).tokens

    # Only the image chunks of the (single) user message carry image data.
    chunk_images = [
        image_from_chunk(part) for part in chat_request.messages[0].content
        if isinstance(part, ImageURLChunk)
    ]

    return TokensPrompt(
        prompt_token_ids=token_ids,
        multi_modal_data=MultiModalDataBuiltins(image=chunk_images),
    )


92
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Create a text prompt with attached images via the HF chat template.

    Args:
        urls: Image URLs to include in the prompt.

    Returns:
        A ``TextPrompt`` whose ``prompt`` is the result of the processor's
        ``apply_chat_template`` and whose ``multi_modal_data`` holds the
        image objects taken from the message's ``image`` parts.
    """
    messages = _create_msg_format_hf(urls)

    processor = AutoProcessor.from_pretrained(
        os.path.join(models_path_prefix, "mistral-community/pixtral-12b"))
    rendered_prompt = processor.apply_chat_template(messages)

    # Pull the image objects back out of the message's content parts.
    message_images = [
        part["image"] for part in messages[0]["content"]
        if part["type"] == "image"
    ]

    return TextPrompt(
        prompt=rendered_prompt,
        multi_modal_data=MultiModalDataBuiltins(image=message_images),
    )


109
110
111
112
# Sampling settings shared by the chat test: temperature 0.0, at most 512 new
# tokens, and 5 logprobs requested per generated token.
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
# Passed to the runner as `limit_mm_per_prompt`.
LIMIT_MM_PER_PROMPT = {"image": 4}

# Values used to parametrize `max_model_len` in test_chat.
MAX_MODEL_LEN = [8192, 65536]

# Directory holding the stored reference outputs; fail at import time if it
# is missing.
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()

# Reference logprob fixture for each model ID.
FIXTURE_LOGPROBS_CHAT = {
    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}

# One (token IDs, text, optional per-token logprobs) tuple per output.
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
123

124
125

# For the test author to store golden output in JSON
126
127
128
129
def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
) -> None:
    """Write outputs with their logprobs to ``filename`` as JSON.

    Each logprob entry is converted with ``dataclasses.asdict``. A missing
    (``None``) or empty logprobs list is written as an empty list.

    Args:
        outputs: ``(tokens, text, logprobs)`` tuples to serialize.
        filename: Destination path; the file is overwritten.
    """
    serializable = []
    for token_ids, generated_text, sample_logprobs in outputs:
        per_position = [{
            token_id: asdict(entry)
            for token_id, entry in position.items()
        } for position in (sample_logprobs or [])]
        serializable.append((token_ids, generated_text, per_position))

    with open(filename, "w") as fp:
        json.dump(serializable, fp)


140
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Load outputs with logprobs from a JSON fixture file.

    The file is expected to hold a list of ``[tokens, text, logprobs]``
    entries, where ``logprobs`` is a list of ``{token_id: fields}`` objects.
    JSON object keys are strings, so they are converted back to ``int`` and
    each ``fields`` mapping is expanded into a ``Logprob``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        One ``(tokens, text, logprobs)`` tuple per entry in the file. A JSON
        ``null`` in the logprobs slot is returned as ``None``.
    """
    with open(filename, "rb") as f:
        json_data = json.load(f)

    def _restore(logprobs):
        # The OutputsLogprobs alias permits a missing logprobs list; pass it
        # through instead of failing while trying to iterate over None.
        if logprobs is None:
            return None
        return [{
            int(k): Logprob(**v)
            for k, v in token_logprobs.items()
        } for token_logprobs in logprobs]

    return [(tokens, text, _restore(logprobs))
            for tokens, text, logprobs in json_data]
Patrick von Platen's avatar
Patrick von Platen committed
148
149


150
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
              local_asset_server) -> None:
    """Compare chat logprobs for 1, 2 and all images against the fixture.

    The model is run in Mistral tokenizer/load/config mode on three
    conversations built from ``IMG_URLS``, and the resulting logprobs are
    checked with ``check_logprobs_close`` against the stored reference for
    ``model``.
    """
    expected_logprobs = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])

    runner_kwargs = {
        "dtype": dtype,
        "tokenizer_mode": "mistral",
        "load_format": "mistral",
        "config_format": "mistral",
        "max_model_len": max_model_len,
        "limit_mm_per_prompt": LIMIT_MM_PER_PROMPT,
    }
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        outputs = []

        asset_urls = [local_asset_server.url_for(name) for name in IMG_URLS]
        # Conversations with the first image, the first two, and all images.
        conversations = [
            _create_msg_format(asset_urls[:count])
            for count in (1, 2, len(asset_urls))
        ]
        for conversation in conversations:
            outputs.extend(
                vllm_model.llm.chat(conversation,
                                    sampling_params=SAMPLING_PARAMS))

    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
    # Remove last `None` prompt_logprobs to compare with fixture
    for idx, entry in enumerate(logprobs):
        assert entry[-1] is None
        logprobs[idx] = entry[:-1]

    check_logprobs_close(
        outputs_0_lst=expected_logprobs,
        outputs_1_lst=logprobs,
        name_0="h100_ref",
        name_1="output",
    )
189
190


191
@pytest.mark.parametrize(
    "image_urls,expected_ranges",
    [
        (IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
        (IMG_URLS[1:4], [
            PlaceholderRange(offset=11, length=266),
            PlaceholderRange(offset=277, length=1056),
            PlaceholderRange(offset=1333, length=418),
        ]),
    ],
)
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
                                  expected_ranges: list[PlaceholderRange],
                                  local_asset_server, monkeypatch) -> None:
    """Check the image placeholder ranges reported on the request output.

    A prompt built through the HF chat template is generated once, and the
    ``image`` entries of ``multi_modal_placeholders`` on the single output
    must match ``expected_ranges`` in count, offset and length.
    """
    local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
    prompt = _create_engine_inputs_hf(local_image_urls)

    # This placeholder checking test only works with V0 engine
    # where `multi_modal_placeholders` is returned with `RequestOutput`
    monkeypatch.setenv("VLLM_USE_V1", "0")

    model_path = os.path.join(models_path_prefix,
                              "mistral-community/pixtral-12b")
    # NOTE(review): load_format="dummy" presumably avoids loading real
    # weights, since only placeholder positions are checked - confirm.
    runner_kwargs = {
        "max_model_len": 8192,
        "limit_mm_per_prompt": LIMIT_MM_PER_PROMPT,
        "load_format": "dummy",
        "hf_overrides": dummy_hf_overrides,
    }
    with vllm_runner(model_path, **runner_kwargs) as vllm_model:
        outputs = vllm_model.llm.generate(prompt)

        assert len(outputs) == 1, f"{len(outputs)=}"
        output: RequestOutput = outputs[0]
        assert hasattr(output,
                       "multi_modal_placeholders"), f"{output.__dict__=}"
        assert "image" in output.multi_modal_placeholders, \
            f"{output.multi_modal_placeholders.keys()=}"

        image_placeholder_ranges: list[PlaceholderRange] = (
            output.multi_modal_placeholders["image"])
        assert len(image_placeholder_ranges) == len(
            expected_ranges), f"{image_placeholder_ranges=}"

        # Compare position by position; offset and length are asserted
        # separately so a failure names the field that differs.
        for real_range, expected_range in zip(image_placeholder_ranges,
                                              expected_ranges):
            assert real_range.offset == expected_range.offset, \
                f"{real_range=} {expected_range=}"
            assert real_range.length == expected_range.length, \
                f"{real_range=} {expected_range=}"