test_transcription_validation.py 3.19 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
# imports for transcription tests
5
6
7
8
9
10
import json

import pytest

from ...utils import RemoteOpenAIServer

Patrick von Platen's avatar
Patrick von Platen committed
11
# Extra server command-line arguments, laid out as flag/value pairs.
# test_basic_audio appends them when the model name starts with "mistralai".
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
]

20
21

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
    """Transcribe the ``mary_had_lamb`` fixture and check text and usage.

    A server is started for ``model_name`` with ``--enforce-eager``; an
    English transcription is requested at temperature 0.0, and the test
    asserts on the returned text and on the ``seconds`` value reported in
    the usage section of the response.
    """
    server_args = ["--enforce-eager"]

    # Model names starting with "mistralai" additionally get the
    # Mistral-specific flags defined in MISTRAL_FORMAT_ARGS.
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS

    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
        # The returned value is decoded as JSON and must expose both a
        # "text" and a "usage" entry.
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "Mary had a little lamb," in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
44
45


46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Check that a transcription request addressed to a LoRA module succeeds.

    The server is launched with LoRA enabled and a ``--lora-modules`` entry
    that maps the name ``speech`` to the base model name; the transcription
    request is then sent with ``model`` set to that LoRA name.
    """
    base_model = "ibm-granite/granite-speech-3.3-2b"
    lora_alias = "speech"

    # Flag/value pairs grouped by purpose, then flattened in the same order
    # the server receives them.
    lora_flags = [
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_alias}={base_model}",
    ]
    limit_flags = ["--max-model-len", "2048", "--max-num-seqs", "1"]
    launch_args = ["--enforce-eager", *lora_flags, *limit_flags]

    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(base_model, launch_args) as server:
        api = server.get_async_client()
        raw_response = await api.audio.transcriptions.create(
            model=lora_alias,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )

    # Checks run after the server context has exited; only the already
    # received response is inspected.
    payload = json.loads(raw_response)
    text = payload["text"]
    usage = payload["usage"]
    assert "mary had a little lamb" in text
    assert usage["seconds"] == 16, usage["seconds"]


81
82
83
84
85
86
87
@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
    """Transcribe the ``foscolo`` fixture with Gemma and check the text.

    An Italian transcription is requested at temperature 0.0 and the test
    asserts that an expected phrase appears in the returned text.
    """
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    model_name = "google/gemma-3n-E2B-it"
    server_args = ["--enforce-eager"]

    # This server is given an explicit startup wait limit of 480 seconds.
    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=foscolo,
            language="it",
            response_format="text",
            temperature=0.0,
        )
        # The returned value is decoded as JSON; only "text" is checked here.
        out = json.loads(transcription)["text"]
        assert "da cui vergine nacque Venere" in out