# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
# json decodes the text responses and the streamed chunks in these tests
import json

import httpx
import librosa
import numpy as np
import pytest
import pytest_asyncio
import soundfile as sf

from vllm.assets.audio import AudioAsset

from ...utils import RemoteOpenAIServer

# Model served by the module-scoped `server` fixture.
MODEL_NAME = "openai/whisper-small"
# Arguments passed to every RemoteOpenAIServer started in this module.
SERVER_ARGS = ["--enforce-eager"]


@pytest.fixture
def foscolo():
    """Yield the 'azacinto_foscolo' audio asset as an open binary file.

    This is the input for the it->en translation tests; the handle is
    closed when the fixture is torn down.
    """
    audio_path = str(AudioAsset('azacinto_foscolo').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file


@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for MODEL_NAME, shared by the module."""
    launcher = RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS)
    with launcher as running_server:
        yield running_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from `server`, closed on teardown."""
    client_cm = server.get_async_client()
    async with client_cm as api_client:
        yield api_client


@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """A text-to-text model must answer a translation request with a 400.

    Starts a dedicated server for a non-ASR model, sends the audio fixture
    to the translations endpoint and checks the error code and message.
    """
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        # Use the client as a context manager so it is closed once the
        # request is done, the same way the `client` fixture handles it.
        async with remote_server.get_async_client() as client:
            res = await client.audio.translations.create(model=model_name,
                                                         file=foscolo,
                                                         temperature=0.0)
        err = res.error
        assert err["code"] == 400 and not res.text
        assert err["message"] == "The model does not support Translations API"


# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client):
    """Translate the sample audio and look for a known phrase in the text."""
    request_kwargs = {
        "model": MODEL_NAME,
        "file": foscolo,
        "response_format": "text",
        # TODO remove once language detection is implemented
        "extra_body": {"language": "it"},
        "temperature": 0.0,
    }
    raw_response = await client.audio.translations.create(**request_kwargs)
    payload = json.loads(raw_response)
    normalized = payload['text'].strip().lower()
    assert "greek sea" in normalized


@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client):
    """Send a starting-text prompt and check what the output leaves out."""
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    request_kwargs = {
        "model": MODEL_NAME,
        "file": foscolo,
        "prompt": prompt,
        "extra_body": {"language": "it"},
        "response_format": "text",
        "temperature": 0.0,
    }
    raw_response = await client.audio.translations.create(**request_kwargs)
    out = json.loads(raw_response)['text']
    # Neither the unprompted opening nor the prompt itself may appear.
    for unexpected in ("Nor will I ever touch the sacred", prompt):
        assert unexpected not in out


@pytest.mark.asyncio
async def test_streaming_response(foscolo, client, server):
    """Streamed translation text must equal the non-streamed response text.

    The reference result comes from the OpenAI client; the streamed result
    is assembled from the `delta.content` of each chunk read over HTTPX.
    """
    res_no_stream = await client.audio.translations.create(
        model=MODEL_NAME,
        file=foscolo,
        response_format="json",
        extra_body=dict(language="it"),
        temperature=0.0)
    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": MODEL_NAME,
        "language": "it",
        "stream": True,
        "temperature": 0.0,
    }
    # Rewind so the second upload starts at the beginning of the file.
    foscolo.seek(0)
    pieces = []
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream("POST",
                                      url,
                                      headers=headers,
                                      data=data,
                                      files=files) as response:
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: "):]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                text = chunk["choices"][0].get("delta", {}).get("content")
                # Chunks with no content add nothing to the text.
                if text:
                    pieces.append(text)

    assert "".join(pieces) == res_no_stream.text


@pytest.mark.asyncio
async def test_stream_options(foscolo, client, server):
    """Check the usage data in a streamed translation response.

    With both usage options enabled, every chunk that has choices must
    contain a "usage" key, and at least one chunk without choices must
    arrive before the stream ends.
    """
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": MODEL_NAME,
        "language": "it",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    final = False
    continuous = True
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream("POST",
                                      url,
                                      headers=headers,
                                      data=data,
                                      files=files) as response:
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: "):]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                choices = chunk.get("choices", [])
                if not choices:
                    # final usage sent
                    final = True
                else:
                    continuous = continuous and ("usage" in chunk)
    assert final and continuous


@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client):
    """Translate the sample played twice and expect the key phrase twice."""
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    repeated_audio = np.tile(audio, 2)
    # Repeated audio to buffer
    buffer = io.BytesIO()
    sf.write(buffer, repeated_audio, sr, format='WAV')
    # Rewind so the upload reads the WAV data from its start.
    buffer.seek(0)
    translation = await client.audio.translations.create(
        model=MODEL_NAME,
        file=buffer,
        extra_body=dict(language="it"),
        response_format="text",
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()
    assert out.count("greek sea") == 2