Unverified commit ae88468b, authored by JasonCohere and committed by GitHub
Browse files

fix: Ensure invalid audio files return 400 error (#34715)


Signed-off-by: Jason Ozuzu <jasonozuzu@cohere.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
parent e05cb3b9
...@@ -108,6 +108,23 @@ async def test_long_audio_request(mary_had_lamb, whisper_client): ...@@ -108,6 +108,23 @@ async def test_long_audio_request(mary_had_lamb, whisper_client):
assert out_usage["seconds"] == 161, out_usage["seconds"] assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
async def test_invalid_audio_file(whisper_client):
    """A payload that is not decodable audio must be rejected with HTTP 400."""
    # The bytes are deliberately not audio, even though the name says ".wav".
    bogus_upload = io.BytesIO(b"not a valid audio file")
    bogus_upload.name = "invalid.wav"

    request_kwargs = dict(model=MODEL_NAME, file=bogus_upload, language="en")
    with pytest.raises(openai.BadRequestError) as raised:
        await whisper_client.audio.transcriptions.create(**request_kwargs)

    # Check both the status code and the user-facing error text.
    error = raised.value
    assert error.status_code == 400
    assert "Invalid or unsupported audio file" in error.message
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_completion_endpoints(whisper_client): async def test_completion_endpoints(whisper_client):
# text to text model # text to text model
......
...@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast ...@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast
import numpy as np import numpy as np
from fastapi import Request from fastapi import Request
from soundfile import LibsndfileError
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
import vllm.envs as envs import vllm.envs as envs
...@@ -57,6 +58,14 @@ try: ...@@ -57,6 +58,14 @@ try:
except ImportError: except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment] librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# Public libsndfile error codes, exposed via `soundfile.LibsndfileError.code`
# (soundfile being librosa's main backend). They are used to decide whether an
# audio loading error is a client error (invalid audio file) or a server error.
#   1 = unrecognised format (file is not a supported audio container)
#   3 = malformed file (corrupt or structurally invalid audio)
#   4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = ( SpeechToTextResponseVerbose: TypeAlias = (
TranscriptionResponseVerbose | TranslationResponseVerbose TranscriptionResponseVerbose | TranslationResponseVerbose
...@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing): ...@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing):
) )
with io.BytesIO(audio_data) as bytes_: with io.BytesIO(audio_data) as bytes_:
# NOTE resample to model SR here for efficiency. This is also a try:
# pre-requisite for chunking, as it assumes Whisper SR. # NOTE resample to model SR here for efficiency. This is also a
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate) # pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
except LibsndfileError as exc:
# Distinguish client errors (invalid audio) from server errors
if exc.code in _BAD_SF_CODES:
raise ValueError("Invalid or unsupported audio file.") from exc
raise
duration = librosa.get_duration(y=y, sr=sr) duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = ( do_split_audio = (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment