Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
63d92abb
Unverified
Commit
63d92abb
authored
Jul 23, 2025
by
deven-labovitch
Committed by
GitHub
Jul 23, 2025
Browse files
[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)
Signed-off-by:
Deven Labovitch
<
deven@videa.ai
>
parent
11599b0e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
5 deletions
+16
-5
docs/serving/openai_compatible_server.md
docs/serving/openai_compatible_server.md
+5
-0
vllm/entrypoints/openai/speech_to_text.py
vllm/entrypoints/openai/speech_to_text.py
+4
-5
vllm/envs.py
vllm/envs.py
+7
-0
No files found.
docs/serving/openai_compatible_server.md
View file @
63d92abb
...
...
@@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
<!-- TODO: api enforced limits + uploading audios -->
#### API Enforced Limits
Set the maximum audio file size (in MB) that vLLM will accept via the
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. The default is 25 MB.
#### Extra Parameters
The following [sampling parameters][sampling-params] are supported.
...
...
vllm/entrypoints/openai/speech_to_text.py
View file @
63d92abb
...
...
@@ -11,6 +11,7 @@ from typing import Callable, Literal, Optional, TypeVar, Union, cast
import
numpy
as
np
from
fastapi
import
Request
import
vllm.envs
as
envs
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.logger
import
RequestLogger
...
...
@@ -38,10 +39,6 @@ T = TypeVar("T", bound=SpeechToTextResponse)
logger
=
init_logger
(
__name__
)
# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
# TODO configurable
MAX_AUDIO_CLIP_FILESIZE_MB
=
25
class
OpenAISpeechToText
(
OpenAIServing
):
"""Base class for speech-to-text operations like transcription and
...
...
@@ -70,6 +67,8 @@ class OpenAISpeechToText(OpenAIServing):
self
.
asr_config
=
self
.
model_cls
.
get_speech_to_text_config
(
model_config
,
task_type
)
self
.
max_audio_filesize_mb
=
envs
.
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
if
self
.
default_sampling_params
:
logger
.
info
(
"Overwriting default completion sampling param with: %s"
,
...
...
@@ -93,7 +92,7 @@ class OpenAISpeechToText(OpenAIServing):
lang
=
request
.
language
or
"en"
self
.
model_cls
.
validate_language
(
lang
)
if
len
(
audio_data
)
/
1024
**
2
>
MAX_AUDIO_CLIP_FILESIZE_MB
:
if
len
(
audio_data
)
/
1024
**
2
>
self
.
max_audio_filesize_mb
:
raise
ValueError
(
"Maximum file size exceeded."
)
with
io
.
BytesIO
(
audio_data
)
as
bytes_
:
...
...
vllm/envs.py
View file @
63d92abb
...
...
@@ -61,6 +61,7 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_VIDEO_FETCH_TIMEOUT
:
int
=
30
VLLM_AUDIO_FETCH_TIMEOUT
:
int
=
10
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
:
int
=
25
VLLM_VIDEO_LOADER_BACKEND
:
str
=
"opencv"
VLLM_MM_INPUT_CACHE_GIB
:
int
=
8
VLLM_TARGET_DEVICE
:
str
=
"cuda"
...
...
@@ -519,6 +520,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_AUDIO_FETCH_TIMEOUT"
,
"10"
)),
# Maximum file size in MB for a single audio file when processing
# speech-to-text requests. Files larger than this will be rejected.
# Default is 25 MB.
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB"
,
"25"
)),
# Backend for Video IO
# - "opencv": Default backend that uses OpenCV stream buffered backend.
#
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment