test_translation_validation.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
# imports for guided decoding tests
import json

import httpx
import librosa
import numpy as np
import pytest
import pytest_asyncio
import soundfile as sf

from vllm.assets.audio import AudioAsset

from ...utils import RemoteOpenAIServer

MODEL_NAME = "openai/whisper-small"
SERVER_ARGS = ["--enforce-eager"]


@pytest.fixture
def foscolo():
    # Test translation it->en
    path = AudioAsset('azacinto_foscolo').get_local_path()
    with open(str(path), "rb") as f:
        yield f


@pytest.fixture(scope="module")
def server():
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(model=model_name,
                                                     file=foscolo,
                                                     temperature=0.0)
        err = res.error
        assert err["code"] == 400 and not res.text
        assert err["message"] == "The model does not support Translations API"


# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client):
    translation = await client.audio.translations.create(
        model=MODEL_NAME,
        file=foscolo,
        response_format="text",
        # TODO remove once language detection is implemented
        extra_body=dict(language="it"),
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()
    assert "greek sea" in out


@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client):
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    transcription = await client.audio.translations.create(
        model=MODEL_NAME,
        file=foscolo,
        prompt=prompt,
        extra_body=dict(language="it"),
        response_format="text",
        temperature=0.0)
    out = json.loads(transcription)['text']
    assert "Nor will I ever touch the sacred" not in out
    assert prompt not in out


@pytest.mark.asyncio
async def test_streaming_response(foscolo, client, server):
    translation = ""
    res_no_stream = await client.audio.translations.create(
        model=MODEL_NAME,
        file=foscolo,
        response_format="json",
        extra_body=dict(language="it"),
        temperature=0.0)
    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": MODEL_NAME,
        "language": "it",
        "stream": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream("POST",
                                      url,
                                      headers=headers,
                                      data=data,
                                      files=files) as response:
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: "):]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                text = chunk["choices"][0].get("delta", {}).get("content")
                translation += text or ""

    assert translation == res_no_stream.text


@pytest.mark.asyncio
async def test_stream_options(foscolo, client, server):
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": MODEL_NAME,
        "language": "it",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    final = False
    continuous = True
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream("POST",
                                      url,
                                      headers=headers,
                                      data=data,
                                      files=files) as response:
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: "):]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                choices = chunk.get("choices", [])
                if not choices:
                    # final usage sent
                    final = True
                else:
                    continuous = continuous and ("usage" in chunk)
    assert final and continuous


@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client):
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    repeated_audio = np.tile(audio, 2)
    # Repeated audio to buffer
    buffer = io.BytesIO()
    sf.write(buffer, repeated_audio, sr, format='WAV')
    buffer.seek(0)
    translation = await client.audio.translations.create(
        model=MODEL_NAME,
        file=buffer,
        extra_body=dict(language="it"),
        response_format="text",
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()
    assert out.count("greek sea") == 2