"vllm/vscode:/vscode.git/clone" did not exist on "32985bed7c88f654b11f919ead34d77e846c32e3"
test_pixtral.py 5.98 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
import json
from dataclasses import asdict
5
from typing import TYPE_CHECKING, Any, Optional
6

Patrick von Platen's avatar
Patrick von Platen committed
7
import pytest
8
from mistral_common.multimodal import download_image
9
10
11
12
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
13
from transformers import AutoProcessor
14

15
from vllm import SamplingParams, TextPrompt, TokensPrompt
16
from vllm.logprobs import Logprob, SampleLogprobs
17
from vllm.multimodal import MultiModalDataBuiltins
Patrick von Platen's avatar
Patrick von Platen committed
18

19
from ....utils import VLLM_PATH, large_gpu_test
20
from ...utils import check_logprobs_close
Patrick von Platen's avatar
Patrick von Platen committed
21

22
23
if TYPE_CHECKING:
    from _typeshed import StrPath
Patrick von Platen's avatar
Patrick von Platen committed
24

25
26
27
28
29
# Model identifiers handed to ``vllm_runner`` by ``test_chat``.
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

# ``test_chat`` is parametrized over this list and looks each entry up in
# ``FIXTURE_LOGPROBS_CHAT``, so every model here needs a golden file there.
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]

30
# Image file names; ``test_chat`` turns each one into a URL with
# ``local_asset_server.url_for``.
# NOTE(review): these look like copies of files hosted under
# https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/
# The earlier per-entry comments all repeated the 237-400x300.jpg URL, which
# cannot be right for the other three entries -- confirm the real origin.
IMG_URLS = [
    "237-400x300.jpg",
    "231-200x300.jpg",
    "27-500x500.jpg",
    "17-150x600.jpg",
]
PROMPT = "Describe each image in one short sentence."


def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    """Build a single user message: ``PROMPT`` followed by the given images.

    Args:
        urls: Image URLs. One ``image_url`` chunk is emitted per entry, in
            the order given. An empty list yields a text-only message.

    Returns:
        A one-element list containing the user message dict, whose
        ``content`` is the text chunk followed by the image chunks.
    """
    text_chunk = {"type": "text", "text": PROMPT}
    image_chunks = [{
        "type": "image_url",
        "image_url": {
            "url": url
        },
    } for url in urls]
    return [{"role": "user", "content": [text_chunk, *image_chunks]}]


55
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    """Build a single user message with the images embedded as objects.

    Differs from ``_create_msg_format`` in two ways visible here: the prompt
    is stored under the ``"content"`` key of the text chunk, and every URL is
    fetched immediately with ``download_image`` so the message carries the
    downloaded image rather than its URL.

    Args:
        urls: Image URLs, downloaded in the order given.

    Returns:
        A one-element list containing the user message dict.
    """
    text_chunk = {"type": "text", "content": PROMPT}
    image_chunks = [{
        "type": "image",
        "image": download_image(url),
    } for url in urls]
    return [{"role": "user", "content": [text_chunk, *image_chunks]}]


69
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    """Tokenize the chat message for ``urls`` and attach its images.

    The message from ``_create_msg_format`` is wrapped in a
    ``ChatCompletionRequest`` and encoded with the ``"pixtral"``
    ``MistralTokenizer``. The images are taken from the request's
    ``ImageURLChunk`` entries via ``image_from_chunk``.

    Args:
        urls: Image URLs to include after the text prompt.

    Returns:
        A ``TokensPrompt`` holding the encoded token ids plus a
        ``multi_modal_data`` entry with the extracted images.
    """
    msg = _create_msg_format(urls)

    tokenizer = MistralTokenizer.from_model("pixtral")

    request = ChatCompletionRequest(messages=msg)  # type: ignore[type-var]
    tokenized = tokenizer.encode_chat_completion(request)

    # Only image chunks carry image data; the text chunk is skipped.
    images = [
        image_from_chunk(chunk) for chunk in request.messages[0].content
        if isinstance(chunk, ImageURLChunk)
    ]

    engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
    engine_inputs["multi_modal_data"] = MultiModalDataBuiltins(image=images)

    return engine_inputs


90
def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    """Render the chat message for ``urls`` to a text prompt with images.

    The message from ``_create_msg_format_hf`` is passed through the chat
    template of the ``mistral-community/pixtral-12b`` processor. The images
    already embedded in the message are collected as multi-modal data.

    Args:
        urls: Image URLs to include after the text prompt.

    Returns:
        A ``TextPrompt`` with the templated prompt and the images.
    """
    msg = _create_msg_format_hf(urls)

    # AutoProcessor yields a processor object, so name it accordingly.
    processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
    prompt = processor.apply_chat_template(msg)

    images = [
        chunk["image"] for chunk in msg[0]["content"]
        if chunk["type"] == "image"
    ]

    mm_data = MultiModalDataBuiltins(image=images)
    return TextPrompt(prompt=prompt, multi_modal_data=mm_data)


107
108
109
110
# Sampling settings shared by every chat request issued in ``test_chat``.
SAMPLING_PARAMS = SamplingParams(
    max_tokens=512,
    temperature=0.0,
    logprobs=5,
)
# Forwarded to ``vllm_runner`` as ``limit_mm_per_prompt``.
LIMIT_MM_PER_PROMPT = {"image": 4}

# Context lengths that ``test_chat`` is parametrized over.
MAX_MODEL_LEN = [8192, 65536]
111
112
113
114

# Directory containing the golden-output JSON files. The module-level assert
# makes a missing directory fail as soon as this file is imported.
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()

115
116
117
118
# Golden chat outputs, one JSON file per model id; ``test_chat`` loads the
# entry for its ``model`` parameter with ``load_outputs_w_logprobs``.
FIXTURE_LOGPROBS_CHAT = {
    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
119

120
# Type of the golden data handled by the dump/load helpers: a list of
# (tokens, text, logprobs) triples where logprobs may be None.
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
121

122
123

# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
    outputs: "OutputsLogprobs",
    filename: "StrPath",
) -> None:
    """Write ``outputs`` to ``filename`` as JSON.

    Each ``(tokens, text, logprobs)`` triple is written as a JSON array.
    Every per-position mapping in ``logprobs`` has its dataclass values
    converted to plain dicts with ``dataclasses.asdict``. A ``logprobs`` of
    ``None`` is stored as an empty list.

    Args:
        outputs: Triples to serialize.
        filename: Destination path; an existing file is overwritten.
    """
    json_data = [
        (
            tokens,
            text,
            [{
                key: asdict(logprob)
                for key, logprob in token_logprobs.items()
            } for token_logprobs in (logprobs or [])],
        )
        for tokens, text, logprobs in outputs
    ]

    # Explicit encoding keeps the file independent of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_data, f)


138
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    """Read golden outputs from the JSON file at ``filename``.

    Each stored ``(tokens, text, logprobs)`` entry is rebuilt as a tuple.
    Within ``logprobs``, every mapping has its keys converted back to ``int``
    and its values passed as keyword arguments to ``Logprob``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The reconstructed list of ``(tokens, text, logprobs)`` tuples.
    """
    with open(filename, "rb") as f:
        json_data = json.load(f)

    return [
        (
            tokens,
            text,
            [{
                # JSON object keys are strings; convert them back to int.
                int(key): Logprob(**kwargs)
                for key, kwargs in token_logprobs.items()
            } for token_logprobs in logprobs],
        )
        for tokens, text, logprobs in json_data
    ]
Patrick von Platen's avatar
Patrick von Platen committed
146
147


148
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
              local_asset_server) -> None:
    """Compare chat logprobs for 1, 2 and 4 images against the golden file.

    The model is started with the ``"mistral"`` tokenizer, load and config
    formats. Three conversations are sent through ``llm.chat``, using the
    first image, the first two images, and all of ``IMG_URLS``. The resulting
    logprobs are checked against the fixture for ``model`` with
    ``check_logprobs_close``.
    """
    expected_chat_logprobs = load_outputs_w_logprobs(
        FIXTURE_LOGPROBS_CHAT[model])

    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
            load_format="mistral",
            config_format="mistral",
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = []

        urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
        msgs = [
            _create_msg_format(urls_all[:1]),
            _create_msg_format(urls_all[:2]),
            _create_msg_format(urls_all),
        ]
        for msg in msgs:
            output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)

            outputs.extend(output)

    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)

    # Remove last `None` prompt_logprobs to compare with fixture
    trimmed_logprobs = []
    for entry in logprobs:
        assert entry[-1] is None
        trimmed_logprobs.append(entry[:-1])

    check_logprobs_close(outputs_0_lst=expected_chat_logprobs,
                         outputs_1_lst=trimmed_logprobs,
                         name_0="h100_ref",
                         name_1="output")