"tests/kernels/moe/untest_modular_kernel_combinations.py" did not exist on "711e912946d23f4ccc1f554b1524c960553c5e28"
test_ultravox.py 9.1 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import json
4
from typing import Any, Optional
5
6
7

import numpy as np
import pytest
8
import pytest_asyncio
9
from transformers import AutoModel, AutoTokenizer
10

11
from vllm.multimodal.audio import resample_audio_librosa
12
13
from vllm.sequence import SampleLogprobs

14
from ....conftest import HfRunner, VllmRunner, _AudioAssets
15
from ....utils import RemoteOpenAIServer
16
from ...registry import HF_EXAMPLE_MODELS
17
from ...utils import check_logprobs_close
18

19
# Model identifier used by every fixture and test in this module.
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

# One audio clip as handed to the runners: (samples, sample rate).
AudioTuple = tuple[np.ndarray, int]

# Text inserted into the prompt once per audio clip. The vLLM and HF sides
# currently use the same string but are kept as separate names.
VLLM_PLACEHOLDER = "<|audio|>"
HF_PLACEHOLDER = "<|audio|>"

26
27
28
29
30
31
32
# Engine options that switch chunked prefill on. The batched-token limit is
# kept very small on purpose so that chunked prefill is actually exercised.
CHUNKED_PREFILL_KWARGS = {
    "enable_chunked_prefill": True,
    "max_num_seqs": 2,
    "max_num_batched_tokens": 16,
}

33

34
35
36
37
@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
def audio(request):
    """Provide one ``AudioAsset`` per parametrized asset name."""
    # Imported lazily, inside the fixture, rather than at module import time.
    from vllm.assets.audio import AudioAsset

    asset_name = request.param
    return AudioAsset(asset_name)
38
39


40
41
42
43
44
45
46
47
48
49
50
51
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
    """Translate keyword arguments into command-line flags.

    Underscores in each key become dashes. A boolean ``True`` yields a bare
    ``--flag``, a boolean ``False`` yields nothing, and any other value
    yields ``--flag=value``.
    """
    cli_args = []
    for name, setting in params_kwargs.items():
        flag = f"--{name.replace('_', '-')}"
        if not isinstance(setting, bool):
            cli_args.append(f"{flag}={setting}")
        elif setting:
            cli_args.append(flag)
    return cli_args


52
53
54
55
@pytest.fixture(params=[
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def server(request, audio_assets: _AudioAssets):
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The fixture is parametrized twice: once with no extra options (marked
    ``cpu_model``) and once with ``CHUNKED_PREFILL_KWARGS``. The extra
    options are appended to the base argument list as CLI flags.
    """
    args = [
        "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
        "--limit-mm-per-prompt",
        # Allow as many audios per prompt as there are audio assets.
        json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
    ] + params_kwargs_to_cli_args(request.param)

    with RemoteOpenAIServer(MODEL_NAME,
                            args,
                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
                                      "30"}) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture."""
    async with server.get_async_client() as openai_client:
        yield openai_client


76
77
78
def _get_prompt(audio_count, question, placeholder):
    """Build a single-turn user prompt through the model's chat template.

    Args:
        audio_count: How many times ``placeholder`` (each followed by a
            newline) is repeated in front of the question.
        question: Text appended after the placeholders.
        placeholder: Audio placeholder string.

    Returns:
        The result of ``apply_chat_template`` called with ``tokenize=False``
        and ``add_generation_prompt=True``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    placeholder = f"{placeholder}\n" * audio_count

    return tokenizer.apply_chat_template(
        [{
            "role": "user",
            "content": f"{placeholder}{question}",
        }],
        tokenize=False,
        add_generation_prompt=True,
    )
86
87


88
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Args:
        vllm_output: ``(token ids, text, logprobs)`` as produced by vLLM.
        model: Name passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        ``(token ids, text, logprobs)`` where the token ids are a copy of
        the input and the decoded EOS token is appended to the text when
        the last generated token is EOS.
    """
    output_ids, output_str, out_logprobs = vllm_output

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = output_ids[:]
    hf_output_str = output_str
    # Guard against an empty generation so indexing [-1] cannot raise.
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    prompts_and_audios: list[tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Inference result should be the same between hf and vllm.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        prompts_and_audios: ``(vllm_prompt, hf_prompt, audio)`` triples.
        model: Model name handed to both runners.
        dtype: dtype string handed to both runners.
        max_tokens: Generation length limit for greedy decoding.
        num_logprobs: Number of logprobs requested per token.
        **kwargs: Extra options forwarded to ``vllm_runner`` only.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    with vllm_runner(model, dtype=dtype, enforce_eager=True,
                     **kwargs) as vllm_model:
        vllm_outputs_per_audio = [
            vllm_model.generate_greedy_logprobs([vllm_prompt],
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                audios=[audio])
            for vllm_prompt, _, audio in prompts_and_audios
        ]

    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
                # The HF side is fed audio resampled to 16 kHz.
                audios=[(resample_audio_librosa(audio[0],
                                                orig_sr=audio[1],
                                                target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
                                        vllm_outputs_per_audio):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )


161
def run_multi_audio_test(
    vllm_runner: type[VllmRunner],
    prompts_and_audios: list[tuple[str, list[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Run vLLM on multi-audio prompts and check that tokens come back.

    Args:
        vllm_runner: Context-manager factory for the vLLM model.
        prompts_and_audios: ``(prompt, audios)`` pairs; ``audios`` is the
            list of clips that belongs to that prompt.
        model: Model name handed to the runner.
        dtype: dtype string handed to the runner.
        max_tokens: Generation length limit for greedy decoding.
        num_logprobs: Number of logprobs requested per token.
        **kwargs: Extra options forwarded to ``vllm_runner``.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # The per-prompt audio limit is the largest clip count of any prompt.
    with vllm_runner(model,
                     dtype=dtype,
                     enforce_eager=True,
                     limit_mm_per_prompt={
                         "audio":
                         max(len(audios) for _, audios in prompts_and_audios)
                     },
                     **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [prompt for prompt, _ in prompts_and_audios],
            max_tokens,
            num_logprobs=num_logprobs,
            audios=[audios for _, audios in prompts_and_audios])

    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(tokens for tokens, *_ in vllm_outputs)


194
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                num_logprobs: int, vllm_kwargs: dict) -> None:
    """Compare HF and vLLM outputs on a single-audio prompt.

    Runs once with default vLLM options and once with
    ``CHUNKED_PREFILL_KWARGS``, for each asset from the ``audio`` fixture.
    """
    vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
    hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
    run_test(
        hf_runner,
        vllm_runner,
        [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )


219
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets,
                                     dtype: str, max_tokens: int,
                                     num_logprobs: int,
                                     vllm_kwargs: dict) -> None:
    """Check that vLLM generates tokens for one prompt with every audio asset.

    The prompt carries one placeholder per entry in ``audio_assets`` and all
    of their clips are passed together.
    """
    vllm_prompt = _get_prompt(len(audio_assets),
                              "Describe each of the audios above.",
                              VLLM_PLACEHOLDER)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate
                        for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )
245
246
247


@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: _AudioAssets):
    """Exercises online serving with/without chunked prefill enabled."""
    # One ``audio_url`` content part per asset, followed by the text question.
    audio_parts = [{
        "type": "audio_url",
        "audio_url": {
            "url": audio.url
        }
    } for audio in audio_assets]

    messages = [{
        "role":
        "user",
        "content": [
            *audio_parts,
            {
                "type":
                "text",
                "text":
                f"What's happening in these {len(audio_assets)} audio clips?"
            },
        ],
    }]

    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
                                                           messages=messages,
                                                           max_tokens=10)

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    # With max_tokens=10 the test expects generation to stop on the limit.
    assert choice.finish_reason == "length"