test_voxtral.py 6.63 KB
Newer Older
1
2
3
4
5
6
7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import pytest
from mistral_common.audio import Audio
8
9
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
10
from transformers import VoxtralForConditionalGeneration
11

12
from vllm.tokenizers.mistral import MistralTokenizer
13
14
15

from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
16
from ...utils import check_logprobs_close
17
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
18
from .vlm_utils import model_utils
19
20
21

# Checkpoint exercised by every test in this module.
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"

# Server CLI flags, as flat flag/value pairs, that select the "mistral"
# tokenizer mode, config format and load format. The offline tests pass the
# same three settings as keyword arguments to the runner.
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
]


31
32
def _get_prompt(audio_assets: AudioTestAssets, question: str) -> list[int]:
    """Tokenize an audio-plus-question user turn for offline vLLM inference.

    Each asset's local file is loaded with mistral_common and wrapped in an
    ``AudioChunk``; the chunks, followed by ``question`` as a ``TextChunk``,
    form a single ``UserMessage`` that is converted to OpenAI format and run
    through the Mistral tokenizer's chat template.

    Args:
        audio_assets: Test assets whose local audio files go into the prompt.
        question: Text placed after all audio chunks in the user message.

    Returns:
        The result of ``apply_chat_template`` for the single user message.
    """
    mistral_tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)

    # Audio chunks come first, in asset order; the question text goes last.
    content = []
    for asset in audio_assets:
        clip = Audio.from_file(str(asset.get_local_path()), strict=False)
        content.append(AudioChunk(input_audio=RawAudio.from_audio(clip)))
    content.append(TextChunk(text=question))

    user_turn = UserMessage(content=content).to_openai()
    return mistral_tokenizer.apply_chat_template(messages=[user_turn])


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Run the shared multi-audio check with a Mistral-tokenized prompt.

    The prompt is tokenized up front by ``_get_prompt`` and handed to
    ``run_multi_audio_test`` together with every asset's
    ``audio_and_sample_rate``, using ``tokenizer_mode="mistral"``.
    """
    prompt_token_ids = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    clips = [asset.audio_and_sample_rate for asset in audio_assets]

    run_multi_audio_test(
        vllm_runner,
        [(prompt_token_ids, clips)],  # type: ignore[list-item]
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )


72
73
74
75
76
77
78
79
80
81
82
def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
    """Check that the OpenAI-compatible server reproduces offline greedy text.

    Phase 1 (offline): greedy generation through ``vllm_runner`` using the
    Mistral tokenizer mode, config format and load format. It runs first to
    avoid CUDA fork issues with multiprocessing (see vlm_utils/core.py).

    Phase 2 (online): the same clips and question are sent to a
    ``RemoteOpenAIServer``; the returned text must equal the offline text,
    which validates that the serving path (chat template, audio encoding,
    tokenization) does not corrupt anything.

    The phases run one after the other so each releases the GPU before the
    next starts.
    """
    num_clips = len(audio_assets)
    question = f"What's happening in these {num_clips} audio clips?"
    max_tokens = 10
    clips = [asset.audio_and_sample_rate for asset in audio_assets]

    # ---- Phase 1: offline greedy generation --------------------------------
    prompt_token_ids = _get_prompt(audio_assets, question)
    with vllm_runner(
        MODEL_NAME,
        dtype="half",
        enforce_eager=True,
        tokenizer_mode="mistral",
        config_format="mistral",
        load_format="mistral",
        limit_mm_per_prompt={"audio": num_clips},
    ) as vllm_model:
        offline_outputs = vllm_model.generate_greedy(
            [prompt_token_ids],
            max_tokens,
            audios=[clips],
        )

    # The generated text is read from index 1 of the first output entry.
    offline_text = offline_outputs[0][1]
    assert offline_text, "Offline vLLM inference produced empty output"

    # ---- Phase 2: same request through the OpenAI-compatible server --------
    audio_parts = []
    for asset in audio_assets:
        clip = Audio.from_file(str(asset.get_local_path()), strict=False)
        # NOTE(review): the format is forced to "wav" before the chunk is
        # built; presumably the OpenAI-style encoding needs it - confirm.
        clip.format = "wav"
        audio_parts.append(AudioChunk.from_audio(clip).to_openai())

    user_turn = {
        "role": "user",
        "content": audio_parts + [{"type": "text", "text": question}],
    }

    server_args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": num_clips}),
    ]
    server_args.extend(MISTRAL_FORMAT_ARGS)

    with RemoteOpenAIServer(
        MODEL_NAME,
        server_args,
        env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"},
    ) as remote_server:
        completion = remote_server.get_client().chat.completions.create(
            model=MODEL_NAME,
            messages=[user_turn],
            max_tokens=max_tokens,
            temperature=0,
        )

    assert len(completion.choices) == 1
    choice = completion.choices[0]
    # Generation is expected to stop at the max_tokens cap.
    assert choice.finish_reason == "length"

    online_text = choice.message.content
    assert online_text == offline_text, (
        f"Online serving output does not match offline inference.\n"
        f"  Online:  {online_text!r}\n"
        f"  Offline: {offline_text!r}"
    )


def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
    """Compare vLLM Mistral-format output against HF Transformers reference.

    Instead of requiring an exact text match (which is brittle across
    attention backends), we compare per-token logprobs using the standard
    check_logprobs_close helper: when tokens diverge at a position, each
    runner's chosen token must appear in the other's top-k logprobs.

    NOTE(review): this docstring previously said the test was marked
    xfail(strict=False), but no such marker is applied to this function,
    so a failure here is reported as a regular test failure. Add the
    marker if non-blocking behavior is still wanted.
    """
    question = f"What's happening in these {len(audio_assets)} audio clips?"
    max_tokens = 10
    num_logprobs = 5
    audio_data = [asset.audio_and_sample_rate for asset in audio_assets]

    # vLLM side: token-ID prompt built with the Mistral tokenizer, model
    # loaded with the Mistral tokenizer mode, config format and load format.
    vllm_prompt = _get_prompt(audio_assets, question)
    with vllm_runner(
        MODEL_NAME,
        dtype="half",
        enforce_eager=True,
        tokenizer_mode="mistral",
        config_format="mistral",
        load_format="mistral",
        limit_mm_per_prompt={"audio": len(audio_assets)},
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [vllm_prompt],
            max_tokens,
            num_logprobs,
            audios=[audio_data],
        )
    assert vllm_outputs[0][1], "vLLM inference produced empty output"

    # HF side: the runner is patched by voxtral_patch_hf_runner and is given
    # the raw question text rather than a pre-tokenized prompt.
    with hf_runner(
        MODEL_NAME,
        dtype="half",
        auto_cls=VoxtralForConditionalGeneration,
    ) as hf_model:
        hf_model = model_utils.voxtral_patch_hf_runner(hf_model)
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            [question],
            max_tokens,
            num_logprobs,
            audios=[audio_data],
        )
    assert hf_outputs[0][1], "HF Transformers produced empty output"

    # Print both generations so they are available when inspecting a run.
    print(
        f"HF Reference Comparison\n"
        f"  vLLM: {vllm_outputs[0][1]!r}\n"
        f"  HF:   {hf_outputs[0][1]!r}"
    )
    check_logprobs_close(
        outputs_0_lst=vllm_outputs,
        outputs_1_lst=hf_outputs,
        name_0="vllm",
        name_1="hf",
    )