Unverified commit ae88468b, authored by JasonCohere and committed by GitHub
Browse files

fix: Ensure invalid audio files return 400 error (#34715)


Signed-off-by: Jason Ozuzu <jasonozuzu@cohere.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
parent e05cb3b9
......@@ -108,6 +108,23 @@ async def test_long_audio_request(mary_had_lamb, whisper_client):
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
async def test_invalid_audio_file(whisper_client):
    """A payload that is not decodable audio must be rejected with HTTP 400."""
    # Arbitrary non-audio bytes wrapped in a file-like object named like a WAV.
    bogus_upload = io.BytesIO(b"not a valid audio file")
    bogus_upload.name = "invalid.wav"

    request_kwargs = dict(model=MODEL_NAME, file=bogus_upload, language="en")
    with pytest.raises(openai.BadRequestError) as raised:
        await whisper_client.audio.transcriptions.create(**request_kwargs)

    # Inspect the captured exception only after the `with` block has exited;
    # statements placed after the raising call inside it would never run.
    error = raised.value
    assert error.status_code == 400
    assert "Invalid or unsupported audio file" in error.message
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
# text to text model
......
......@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast
import numpy as np
from fastapi import Request
from soundfile import LibsndfileError
from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
......@@ -57,6 +58,14 @@ try:
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to decide whether an audio loading error is a
# client error (invalid audio file) rather than a server error.
#   1 = unrecognised format (file is not a supported audio container)
#   3 = malformed file (corrupt or structurally invalid audio)
#   4 = unsupported encoding (codec not supported by this libsndfile build)
# Stored as a frozenset so this module-level lookup table cannot be mutated at
# runtime; membership tests (`code in _BAD_SF_CODES`) behave exactly as with a set.
_BAD_SF_CODES = frozenset({1, 3, 4})
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
TranscriptionResponseVerbose | TranslationResponseVerbose
......@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing):
)
with io.BytesIO(audio_data) as bytes_:
try:
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
except LibsndfileError as exc:
# Distinguish client errors (invalid audio) from server errors
if exc.code in _BAD_SF_CODES:
raise ValueError("Invalid or unsupported audio file.") from exc
raise
duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment