fix: Ensure invalid audio files return 400 error (#34715)

Signed-off-by: Jason Ozuzu <jasonozuzu@cohere.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>

fix: Ensure invalid audio files return 400 error (#34715)
Signed-off-by: Jason Ozuzu <jasonozuzu@cohere.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
ae88468b · JasonCohere · GitHub · e05cb3b9 · ae88468b · ae88468b
Unverified Commit ae88468b authored Mar 03, 2026 by JasonCohere Committed by GitHub Mar 03, 2026
2 changed files
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@@ -108,6 +108,23 @@ async def test_long_audio_request(mary_had_lamb, whisper_client):
    assert out_usage["seconds"] == 161, out_usage["seconds"]
+@pytest.mark.asyncio
+async def test_invalid_audio_file(whisper_client):
+    """Corrupted audio should surface as HTTP 400."""
+    invalid_audio = io.BytesIO(b"not a valid audio file")
+    invalid_audio.name = "invalid.wav"
+    with pytest.raises(openai.BadRequestError) as exc_info:
+        await whisper_client.audio.transcriptions.create(
+            model=MODEL_NAME,
+            file=invalid_audio,
+            language="en",
+        )
+    assert exc_info.value.status_code == 400
+    assert "Invalid or unsupported audio file" in exc_info.value.message
 @pytest.mark.asyncio
 async def test_completion_endpoints(whisper_client):
    # text to text model

--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
@@ -11,6 +11,7 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast
 import numpy as np
 from fastapi import Request
+from soundfile import LibsndfileError
 from transformers import PreTrainedTokenizerBase
 import vllm.envs as envs
@@ -57,6 +58,14 @@ try:
 except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
+# being librosa's main backend. Used to validate if an audio loading error is due to a
+# server error vs a client error (invalid audio file).
+# 1 = unrecognised format      (file is not a supported audio container)
+# 3 = malformed file           (corrupt or structurally invalid audio)
+# 4 = unsupported encoding     (codec not supported by this libsndfile build)
+_BAD_SF_CODES = {1, 3, 4}
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
    TranscriptionResponseVerbose | TranslationResponseVerbose
@@ -315,9 +324,15 @@ class OpenAISpeechToText(OpenAIServing):
            )
        with io.BytesIO(audio_data) as bytes_:
-            # NOTE resample to model SR here for efficiency. This is also a
-            # pre-requisite for chunking, as it assumes Whisper SR.
-            y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+            try:
+                # NOTE resample to model SR here for efficiency. This is also a
+                # pre-requisite for chunking, as it assumes Whisper SR.
+                y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+            except LibsndfileError as exc:
+                # Distinguish client errors (invalid audio) from server errors
+                if exc.code in _BAD_SF_CODES:
+                    raise ValueError("Invalid or unsupported audio file.") from exc
+                raise
        duration = librosa.get_duration(y=y, sr=sr)
        do_split_audio = (