Unverified commit 63d92abb, authored by deven-labovitch, committed by GitHub
Browse files

[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)


Signed-off-by: Deven Labovitch <deven@videa.ai>
parent 11599b0e
...@@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai ...@@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
Code example: <gh-file:examples/online_serving/openai_transcription_client.py> Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
<!-- TODO: api enforced limits + uploading audios --> <!-- TODO: api enforced limits + uploading audios -->
#### API Enforced Limits
Set the maximum audio file size (in MB) that vLLM will accept via the
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The default is 25 MB; larger files are rejected.
#### Extra Parameters #### Extra Parameters
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
......
...@@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast ...@@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
import numpy as np import numpy as np
from fastapi import Request from fastapi import Request
import vllm.envs as envs
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
...@@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse) ...@@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
logger = init_logger(__name__) logger = init_logger(__name__)
# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
# TODO configurable
MAX_AUDIO_CLIP_FILESIZE_MB = 25
class OpenAISpeechToText(OpenAIServing): class OpenAISpeechToText(OpenAIServing):
"""Base class for speech-to-text operations like transcription and """Base class for speech-to-text operations like transcription and
...@@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing): ...@@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
self.asr_config = self.model_cls.get_speech_to_text_config( self.asr_config = self.model_cls.get_speech_to_text_config(
model_config, task_type) model_config, task_type)
self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
if self.default_sampling_params: if self.default_sampling_params:
logger.info( logger.info(
"Overwriting default completion sampling param with: %s", "Overwriting default completion sampling param with: %s",
...@@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing): ...@@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
lang = request.language or "en" lang = request.language or "en"
self.model_cls.validate_language(lang) self.model_cls.validate_language(lang)
if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB: if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
raise ValueError("Maximum file size exceeded.") raise ValueError("Maximum file size exceeded.")
with io.BytesIO(audio_data) as bytes_: with io.BytesIO(audio_data) as bytes_:
......
...@@ -61,6 +61,7 @@ if TYPE_CHECKING: ...@@ -61,6 +61,7 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
VLLM_MM_INPUT_CACHE_GIB: int = 8 VLLM_MM_INPUT_CACHE_GIB: int = 8
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
...@@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT": "VLLM_AUDIO_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
# Maximum filesize in MB for a single audio file when processing
# speech-to-text requests. Files larger than this will be rejected.
# Default is 25 MB
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
# Backend for Video IO # Backend for Video IO
# - "opencv": Default backend that uses OpenCV stream buffered backend. # - "opencv": Default backend that uses OpenCV stream buffered backend.
# #
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment