[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)

Signed-off-by: Deven Labovitch <deven@videa.ai>

[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)
Signed-off-by: Deven Labovitch <deven@videa.ai>
63d92abb · deven-labovitch · GitHub · 11599b0e · 63d92abb · 63d92abb
Unverified Commit 63d92abb authored Jul 23, 2025 by deven-labovitch Committed by GitHub Jul 23, 2025
Showing with 16 additions and 5 deletions

docs/serving/openai_compatible_server.md docs/serving/openai_compatible_server.md +5 -0

vllm/entrypoints/openai/speech_to_text.py vllm/entrypoints/openai/speech_to_text.py +4 -5

vllm/envs.py vllm/envs.py +7 -0

No files found.
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
 <!-- TODO: api enforced limits + uploading audios -->
+#### API Enforced Limits
+Set the maximum audio file size (in MB) that vLLM will accept via the
+`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The default is 25 MB.
 #### Extra Parameters
 The following [sampling parameters][sampling-params] are supported.

--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
 import numpy as np
 from fastapi import Request
+import vllm.envs as envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
@@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
 logger = init_logger(__name__)
-# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
-# TODO configurable
-MAX_AUDIO_CLIP_FILESIZE_MB = 25
 class OpenAISpeechToText(OpenAIServing):
    """Base class for speech-to-text operations like transcription and 
@@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
        self.asr_config = self.model_cls.get_speech_to_text_config(
            model_config, task_type)
+        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
        if self.default_sampling_params:
            logger.info(
                "Overwriting default completion sampling param with: %s",
@@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
        lang = request.language or "en"
        self.model_cls.validate_language(lang)
-        if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB:
+        if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
            raise ValueError("Maximum file size exceeded.")
        with io.BytesIO(audio_data) as bytes_:

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -61,6 +61,7 @@ if TYPE_CHECKING:
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    VLLM_VIDEO_FETCH_TIMEOUT: int = 30
    VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
    VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
    VLLM_MM_INPUT_CACHE_GIB: int = 8
    VLLM_TARGET_DEVICE: str = "cuda"
@@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
+    # Maximum file size in MB (1 MB = 1024**2 bytes) for a single audio file
+    # in speech-to-text requests. Files larger than this will be rejected.
+    # Default is 25 MB.
+    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
+    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
    # Backend for Video IO
    # - "opencv": Default backend that uses OpenCV stream buffered backend.
    #