Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c7f98b4d
Unverified
Commit
c7f98b4d
authored
Mar 21, 2026
by
Isotr0py
Committed by
GitHub
Mar 21, 2026
Browse files
[Frontend] Remove librosa from audio dependency (#37058)
Signed-off-by:
Isotr0py
<
mozf@mail2.sysu.edu.cn
>
parent
1c472f8f
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
247 additions
and
188 deletions
+247
-188
requirements/test.in
requirements/test.in
+1
-0
requirements/test.txt
requirements/test.txt
+4
-0
setup.py
setup.py
+2
-2
tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
...ts/openai/speech_to_text/test_transcription_validation.py
+1
-1
tests/entrypoints/openai/test_run_batch.py
tests/entrypoints/openai/test_run_batch.py
+1
-1
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/builders.py
+1
-4
tests/multimodal/media/test_audio.py
tests/multimodal/media/test_audio.py
+16
-22
tests/multimodal/test_audio.py
tests/multimodal/test_audio.py
+19
-19
vllm/assets/audio.py
vllm/assets/audio.py
+2
-7
vllm/assets/video.py
vllm/assets/video.py
+2
-7
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+2
-5
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+8
-46
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nano_nemotron_vl.py
+3
-2
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+87
-9
vllm/multimodal/media/audio.py
vllm/multimodal/media/audio.py
+96
-58
vllm/multimodal/parse.py
vllm/multimodal/parse.py
+1
-1
vllm/renderers/base.py
vllm/renderers/base.py
+0
-3
vllm/transformers_utils/processors/fireredasr2.py
vllm/transformers_utils/processors/fireredasr2.py
+1
-1
No files found.
requirements/test.in
View file @
c7f98b4d
...
...
@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
...
...
requirements/test.txt
View file @
c7f98b4d
...
...
@@ -544,6 +544,7 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
# resampy
numpy==2.2.6
# via
# -r requirements/test.in
...
...
@@ -584,6 +585,7 @@ numpy==2.2.6
# pyogrio
# pywavelets
# rasterio
# resampy
# rioxarray
# rouge-score
# runai-model-streamer
...
...
@@ -995,6 +997,8 @@ requests==2.32.3
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test.in
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
...
...
setup.py
View file @
c7f98b4d
...
...
@@ -987,11 +987,11 @@ setup(
"instanttensor"
:
[
"instanttensor >= 0.1.5"
],
"runai"
:
[
"runai-model-streamer[s3,gcs,azure] >= 0.15.7"
],
"audio"
:
[
"librosa"
,
"av"
,
"resampy"
,
"scipy"
,
"soundfile"
,
"mistral_common[audio]"
,
"av"
,
],
# Required for audio processing
"video"
:
[],
# Kept for backwards compatibility
"flashinfer"
:
[],
# Kept for backwards compatibility
...
...
tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
View file @
c7f98b4d
...
...
@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
model_name
,
foscolo
,
language
=
"it"
,
expected_text
=
"ove il mio corpo fanciulletto
giacque
"
,
expected_text
=
"ove il mio corpo fanciulletto"
,
)
tests/entrypoints/openai/test_run_batch.py
View file @
c7f98b4d
...
...
@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
]
)
MINIMAL_WAV_BASE64
=
"UklGRi
Q
AAABXQVZFZm10IBAAAAABAAEA
QB8AAEAfAAABAAg
AZGF0YQAAAAA
=
"
MINIMAL_WAV_BASE64
=
"UklGRi
g
AAABXQVZFZm10IBAAAAABAAEA
gD4AAAB9AAACABA
AZGF0YQ
Q
AAAAA
AP9/
"
INPUT_TRANSCRIPTION_BATCH
=
(
json
.
dumps
(
{
...
...
tests/models/multimodal/generation/vlm_utils/builders.py
View file @
c7f98b4d
...
...
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
)
resampler
=
AudioResampler
(
target_sr
=
16000
,
method
=
"librosa"
,
)
resampler
=
AudioResampler
(
target_sr
=
16000
)
audios
=
[
asset
.
audio_and_sample_rate
for
asset
in
audio_assets
]
resampled_audios
=
[
(
...
...
tests/multimodal/media/test_audio.py
View file @
c7f98b4d
...
...
@@ -10,6 +10,8 @@ import pytest
from
vllm.multimodal.media
import
AudioMediaIO
from
...conftest
import
AudioTestAssets
pytestmark
=
pytest
.
mark
.
cpu_test
ASSETS_DIR
=
Path
(
__file__
).
parent
.
parent
/
"assets"
...
...
@@ -22,40 +24,32 @@ def dummy_audio():
@
pytest
.
fixture
def
dummy_audio_bytes
():
return
b
"FAKEAUDIOBYTES"
def
dummy_audio_bytes
(
audio_assets
:
AudioTestAssets
):
with
open
(
audio_assets
[
0
].
get_local_path
(),
"rb"
)
as
f
:
return
f
.
read
()
def
test_audio_media_io_load_bytes
(
dummy_audio_bytes
):
audio_io
=
AudioMediaIO
()
with
patch
(
"librosa.load"
)
as
mock_load
:
mock_load
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
out
=
audio_io
.
load_bytes
(
dummy_audio_bytes
)
mock_load
.
assert_called_once
()
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
out
=
audio_io
.
load_bytes
(
dummy_audio_bytes
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_load_base64
(
dummy_audio_bytes
):
audio_io
=
AudioMediaIO
()
encoded
=
base64
.
b64encode
(
dummy_audio_bytes
).
decode
(
"utf-8"
)
with
patch
.
object
(
AudioMediaIO
,
"load_bytes"
)
as
mock_load_bytes
:
mock_load_bytes
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
out
=
audio_io
.
load_base64
(
"audio/wav"
,
encoded
)
mock_load_bytes
.
assert_called_once
()
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
out
=
audio_io
.
load_base64
(
"audio/wav"
,
encoded
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_load_file
():
def
test_audio_media_io_load_file
(
audio_assets
:
AudioTestAssets
):
audio_io
=
AudioMediaIO
()
path
=
Path
(
"/fake/path.wav"
)
with
patch
(
"librosa.load"
)
as
mock_load
:
mock_load
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
out
=
audio_io
.
load_file
(
path
)
mock_load
.
assert_called_once_with
(
path
,
sr
=
None
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
path
=
audio_assets
[
0
].
get_local_path
()
out
=
audio_io
.
load_file
(
path
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_encode_base64
(
dummy_audio
):
...
...
tests/multimodal/test_audio.py
View file @
c7f98b4d
...
...
@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec
,
ChannelReduction
,
normalize_audio
,
resample_audio_
librosa
,
resample_audio_
pyav
,
resample_audio_scipy
,
split_audio
,
)
...
...
@@ -25,14 +25,14 @@ def dummy_audio():
return
np
.
array
([
0.0
,
0.1
,
0.2
,
0.3
,
0.4
],
dtype
=
float
)
def
test_resample_audio_
librosa
(
dummy_audio
):
with
patch
(
"vllm.multimodal.audio.librosa.resample"
)
as
mock_resample
:
mock_resample
.
return_value
=
dummy_audio
*
2
out
=
resample_audio_
librosa
(
dummy_audio
,
orig_sr
=
4
4100
,
target_sr
=
22050
)
mock_resample
.
assert_called_once_with
(
dummy_audio
,
orig_sr
=
44100
,
target_sr
=
22050
)
assert
np
.
all
(
out
==
dummy_audio
*
2
)
def
test_resample_audio_
pyav
(
dummy_audio
):
out_down
=
resample_audio_pyav
(
dummy_audio
,
orig_sr
=
4
,
target_sr
=
2
)
out_up
=
resample_audio_pyav
(
dummy_audio
,
orig_sr
=
2
,
target_sr
=
4
)
out
_same
=
resample_audio_
pyav
(
dummy_audio
,
orig_sr
=
4
,
target_sr
=
4
)
assert
len
(
out_down
)
==
3
assert
len
(
out_up
)
==
10
assert
np
.
all
(
out
_same
==
dummy_audio
)
def
test_resample_audio_scipy
(
dummy_audio
):
...
...
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert
np
.
isfinite
(
out
).
all
()
def
test_audio_resampler_
librosa
_calls_resample
(
dummy_audio
):
resampler
=
AudioResampler
(
target_sr
=
22050
,
method
=
"
librosa
"
)
with
patch
(
"vllm.multimodal.audio.resample_audio_
librosa
"
)
as
mock_resample
:
def
test_audio_resampler_
pyav
_calls_resample
(
dummy_audio
):
resampler
=
AudioResampler
(
target_sr
=
22050
,
method
=
"
pyav
"
)
with
patch
(
"vllm.multimodal.audio.resample_audio_
pyav
"
)
as
mock_resample
:
mock_resample
.
return_value
=
dummy_audio
out
=
resampler
.
resample
(
dummy_audio
,
orig_sr
=
44100
)
mock_resample
.
assert_called_once_with
(
...
...
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
np
.
zeros
(
16000
),
decimal
=
5
)
def
test_
librosa
_mono_passthrough_e2e
(
self
):
"""Full pipeline:
librosa
mono format → preserved as mono."""
def
test_
pyav
_mono_passthrough_e2e
(
self
):
"""Full pipeline:
pyav
mono format → preserved as mono."""
from
vllm.multimodal.parse
import
MultiModalDataParser
# Simulate
librosa
output: already mono (time,) format
mono_
librosa
=
np
.
random
.
randn
(
16000
).
astype
(
np
.
float32
)
assert
mono_
librosa
.
shape
==
(
16000
,)
# Simulate
pyav
output: already mono (time,) format
mono_
pyav
=
np
.
random
.
randn
(
16000
).
astype
(
np
.
float32
)
assert
mono_
pyav
.
shape
==
(
16000
,)
# Create parser with mono normalization
parser
=
MultiModalDataParser
(
...
...
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
)
# Process audio through the parser
result
=
parser
.
_parse_audio_data
((
mono_
librosa
,
16000
))
result
=
parser
.
_parse_audio_data
((
mono_
pyav
,
16000
))
audio_output
=
result
.
get
(
0
)
# Verify output is still mono 1D
...
...
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert
audio_output
.
shape
==
(
16000
,)
# Verify audio content is preserved
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
mono_
librosa
)
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
mono_
pyav
)
def
test_multichannel_5_1_surround_to_mono_e2e
(
self
):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
...
...
vllm/assets/audio.py
View file @
c7f98b4d
...
...
@@ -8,15 +8,10 @@ from urllib.parse import urljoin
import
numpy.typing
as
npt
from
vllm.
utils.import_utils
import
PlaceholderModule
from
vllm.
multimodal.media.audio
import
load_audio
from
.base
import
VLLM_S3_BUCKET_URL
,
get_vllm_public_assets
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
ASSET_DIR
=
"multimodal_asset"
AudioAssetName
=
Literal
[
"winning_call"
,
"mary_had_lamb"
]
...
...
@@ -33,7 +28,7 @@ class AudioAsset:
@
property
def
audio_and_sample_rate
(
self
)
->
tuple
[
npt
.
NDArray
,
float
]:
audio_path
=
get_vllm_public_assets
(
filename
=
self
.
filename
,
s3_prefix
=
ASSET_DIR
)
return
l
ibrosa
.
load
(
audio_path
,
sr
=
None
)
return
l
oad_audio
(
audio_path
,
sr
=
None
)
def
get_local_path
(
self
)
->
Path
:
return
get_vllm_public_assets
(
filename
=
self
.
filename
,
s3_prefix
=
ASSET_DIR
)
...
...
vllm/assets/video.py
View file @
c7f98b4d
...
...
@@ -10,15 +10,10 @@ import numpy.typing as npt
from
huggingface_hub
import
hf_hub_download
from
PIL
import
Image
from
vllm.
utils.import_utils
import
PlaceholderModule
from
vllm.
multimodal.media.audio
import
load_audio_pyav
from
.base
import
get_cache_dir
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
@
lru_cache
def
download_video_asset
(
filename
:
str
)
->
str
:
...
...
@@ -146,4 +141,4 @@ class VideoAsset:
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
"""
return
l
ibrosa
.
load
(
self
.
video_path
,
sr
=
sampling_rate
)[
0
]
return
l
oad_audio_pyav
(
self
.
video_path
,
sr
=
sampling_rate
)[
0
]
vllm/benchmarks/datasets.py
View file @
c7f98b4d
...
...
@@ -38,6 +38,7 @@ from typing_extensions import deprecated
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal.audio
import
get_audio_duration
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
...
...
@@ -54,10 +55,6 @@ try:
except
ImportError
:
pd
=
PlaceholderModule
(
"pandas"
)
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset):
break
audio
=
item
[
"audio"
]
y
,
sr
=
audio
[
"array"
],
audio
[
"sampling_rate"
]
duration_s
=
librosa
.
get
_duration
(
y
=
y
,
sr
=
sr
)
duration_s
=
get_audio
_duration
(
y
=
y
,
sr
=
sr
)
if
duration_s
<
asr_min_audio_len_sec
or
duration_s
>
asr_max_audio_len_sec
:
skipped
+=
1
continue
...
...
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
View file @
c7f98b4d
...
...
@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
FlatLogprobs
,
Logprob
from
vllm.model_executor.models
import
SupportsTranscription
from
vllm.multimodal.audio
import
split_audio
from
vllm.multimodal.media.audio
import
extract_audio_from_video_bytes
from
vllm.multimodal.audio
import
get_audio_duration
,
split_audio
from
vllm.multimodal.media.audio
import
load_audio
from
vllm.outputs
import
RequestOutput
from
vllm.renderers.inputs
import
DictPrompt
,
EncoderDecoderDictPrompt
from
vllm.renderers.inputs.preprocess
import
parse_enc_dec_prompt
,
parse_model_prompt
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.tokenizers
import
get_tokenizer
from
vllm.utils.import_utils
import
PlaceholderModule
try
:
import
librosa
except
ImportError
:
librosa
=
PlaceholderModule
(
"librosa"
)
# type: ignore[assignment]
try
:
import
soundfile
as
sf
except
ImportError
:
sf
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES
=
{
1
,
3
,
4
}
SpeechToTextResponse
:
TypeAlias
=
TranscriptionResponse
|
TranslationResponse
SpeechToTextResponseVerbose
:
TypeAlias
=
(
...
...
@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
# pre-requisite for chunking, as it assumes Whisper SR.
try
:
with
io
.
BytesIO
(
audio_data
)
as
buf
:
y
,
sr
=
librosa
.
load
(
buf
,
sr
=
self
.
asr_config
.
sample_rate
)
# type: ignore[return-value]
except
sf
.
LibsndfileError
as
exc
:
# Only fall back for known format-detection failures.
# Re-raise anything else (e.g. corrupt but recognised format).
if
exc
.
code
not
in
_BAD_SF_CODES
:
raise
logger
.
debug
(
"librosa/soundfile could not decode audio from BytesIO "
"(code=%s: %s); falling back to pyav in-process decode"
,
exc
.
code
,
exc
,
)
try
:
native_y
,
native_sr
=
extract_audio_from_video_bytes
(
audio_data
)
sr
=
self
.
asr_config
.
sample_rate
y
=
librosa
.
resample
(
native_y
,
orig_sr
=
native_sr
,
target_sr
=
sr
)
except
Exception
as
pyav_exc
:
logger
.
debug
(
"pyAV fallback also failed: %s"
,
pyav_exc
,
)
raise
ValueError
(
"Invalid or unsupported audio file."
)
from
pyav_exc
y
,
sr
=
load_audio
(
buf
,
sr
=
self
.
asr_config
.
sample_rate
)
except
Exception
as
exc
:
raise
ValueError
(
"Invalid or unsupported audio file."
)
from
exc
duration
=
librosa
.
get
_duration
(
y
=
y
,
sr
=
sr
)
do_split_audio
=
(
self
.
asr_config
.
allow
_audio_c
hunking
duration
=
get_audio
_duration
(
y
=
y
,
sr
=
sr
)
do_split_audio
=
self
.
asr_config
.
allow_audio_chunking
and
(
self
.
asr_config
.
max
_audio_c
lip_s
is
not
None
and
duration
>
self
.
asr_config
.
max_audio_clip_s
)
...
...
vllm/model_executor/models/nano_nemotron_vl.py
View file @
c7f98b4d
...
...
@@ -12,6 +12,7 @@ import math
import
warnings
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
cached_property
from
io
import
BytesIO
from
typing
import
Annotated
,
Literal
,
TypeAlias
import
torch
...
...
@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems
,
VideoItem
,
)
from
vllm.multimodal.media.audio
import
extract_audio_from_video_bytes
from
vllm.multimodal.media.audio
import
load_audio_pyav
from
vllm.multimodal.parse
import
(
AudioProcessorItems
,
ImageEmbeddingItems
,
...
...
@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor(
"video must be loaded with keep_video_bytes=True (e.g. via "
"the chat API with a model that sets use_audio_in_video)."
)
audio_items
.
append
(
extract_audio_from_video_b
ytes
(
video_bytes
))
audio_items
.
append
(
load_audio_pyav
(
B
ytes
IO
(
video_bytes
))
)
# Create a new VideoProcessorItems with metadata that does not contain
# the large video bytes, to avoid modifying the input `mm_items`.
...
...
vllm/multimodal/audio.py
View file @
c7f98b4d
...
...
@@ -12,17 +12,35 @@ import torch
from
vllm.utils.import_utils
import
PlaceholderModule
try
:
import
librosa
import
av
as
av
except
ImportError
:
libros
a
=
PlaceholderModule
(
"
libros
a"
)
# type: ignore[assignment]
a
v
=
PlaceholderModule
(
"a
v
"
)
# type: ignore[assignment]
try
:
import
resampy
except
ImportError
:
resampy
=
PlaceholderModule
(
"resampy"
)
# type: ignore[assignment]
try
:
import
scipy.signal
as
scipy_signal
except
ImportError
:
scipy_signal
=
PlaceholderModule
(
"scipy"
).
placeholder_attr
(
"signal"
)
# type: ignore[assignment]
# ============================================================
# Aligned with `librosa.get_duration` function
def
get_audio_duration
(
*
,
y
:
npt
.
NDArray
[
np
.
floating
],
sr
:
float
=
22050
)
->
float
:
"""Get the duration of an audio array in seconds.
Args:
y: Audio time series. Can be 1D (samples,) or 2D (channels, samples).
sr: Sample rate of the audio in Hz.
Returns:
Duration of the audio in seconds.
"""
n_samples
=
y
.
shape
[
-
1
]
return
float
(
n_samples
)
/
sr
class
ChannelReduction
(
str
,
Enum
):
...
...
@@ -153,13 +171,71 @@ def normalize_audio(
# ============================================================
def
resample_audio_
librosa
(
def
resample_audio_
pyav
(
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
orig_sr
:
float
,
target_sr
:
float
,
)
->
npt
.
NDArray
[
np
.
floating
]:
return
librosa
.
resample
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
"""Resample audio using PyAV (libswresample via FFmpeg).
Args:
audio: Input audio. Can be:
- 1D array ``(samples,)``: mono audio
- 2D array ``(channels, samples)``: stereo audio
orig_sr: Original sample rate in Hz.
target_sr: Target sample rate in Hz.
Returns:
Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
"""
orig_sr_int
=
int
(
round
(
orig_sr
))
target_sr_int
=
int
(
round
(
target_sr
))
if
orig_sr_int
==
target_sr_int
:
return
audio
if
audio
.
ndim
==
2
:
# Resample each channel independently and re-stack.
return
np
.
stack
(
[
resample_audio_pyav
(
ch
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
for
ch
in
audio
],
axis
=
0
,
)
expected_len
=
int
(
math
.
ceil
(
audio
.
shape
[
-
1
]
*
target_sr_int
/
orig_sr_int
))
# from_ndarray expects shape (channels, samples) for planar formats.
# libswresample requires a minimum number of input samples to produce
# output frames; pad short inputs with zeros so we always get output,
# then trim to the expected output length.
_MIN_SAMPLES
=
1024
audio_f32
=
np
.
asarray
(
audio
,
dtype
=
np
.
float32
)
if
len
(
audio_f32
)
<
_MIN_SAMPLES
:
audio_f32
=
np
.
pad
(
audio_f32
,
(
0
,
_MIN_SAMPLES
-
len
(
audio_f32
)))
audio_f32
=
audio_f32
.
reshape
(
1
,
-
1
)
resampler
=
av
.
AudioResampler
(
format
=
"fltp"
,
layout
=
"mono"
,
rate
=
target_sr_int
)
frame
=
av
.
AudioFrame
.
from_ndarray
(
audio_f32
,
format
=
"fltp"
,
layout
=
"mono"
)
frame
.
sample_rate
=
orig_sr_int
out_frames
=
resampler
.
resample
(
frame
)
out_frames
.
extend
(
resampler
.
resample
(
None
))
# flush buffered samples
result
=
np
.
concatenate
([
f
.
to_ndarray
()
for
f
in
out_frames
],
axis
=
1
).
squeeze
(
0
)
return
result
[:
expected_len
]
def
resample_audio_resampy
(
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
orig_sr
:
float
,
target_sr
:
float
,
)
->
npt
.
NDArray
[
np
.
floating
]:
return
resampy
.
resample
(
audio
,
sr_orig
=
orig_sr
,
sr_new
=
target_sr
)
def
resample_audio_scipy
(
...
...
@@ -167,7 +243,7 @@ def resample_audio_scipy(
*
,
orig_sr
:
float
,
target_sr
:
float
,
):
)
->
npt
.
NDArray
[
np
.
floating
]
:
if
orig_sr
>
target_sr
:
return
scipy_signal
.
resample_poly
(
audio
,
1
,
orig_sr
//
target_sr
)
elif
orig_sr
<
target_sr
:
...
...
@@ -181,7 +257,7 @@ class AudioResampler:
def
__init__
(
self
,
target_sr
:
float
|
None
=
None
,
method
:
Literal
[
"
librosa
"
,
"scipy"
]
=
"
librosa
"
,
method
:
Literal
[
"
pyav"
,
"resampy
"
,
"scipy"
]
=
"
resampy
"
,
):
self
.
target_sr
=
target_sr
self
.
method
=
method
...
...
@@ -203,8 +279,10 @@ class AudioResampler:
abs_tol
=
1e-6
,
):
return
audio
if
self
.
method
==
"librosa"
:
return
resample_audio_librosa
(
if
self
.
method
==
"pyav"
:
return
resample_audio_pyav
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
self
.
target_sr
)
if
self
.
method
==
"resampy"
:
return
resample_audio_resampy
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
self
.
target_sr
)
elif
self
.
method
==
"scipy"
:
...
...
@@ -214,7 +292,7 @@ class AudioResampler:
else
:
raise
ValueError
(
f
"Invalid resampling method:
{
self
.
method
}
. "
"Supported methods are '
librosa
' and 'scipy'."
"Supported methods are '
pyav
' and 'scipy'."
)
...
...
vllm/multimodal/media/audio.py
View file @
c7f98b4d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
from
io
import
BytesIO
from
pathlib
import
Path
...
...
@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
from
.base
import
MediaIO
try
:
import
libros
a
import
a
v
except
ImportError
:
libros
a
=
PlaceholderModule
(
"
libros
a"
)
# type: ignore[assignment]
a
v
=
PlaceholderModule
(
"a
v
"
)
# type: ignore[assignment]
try
:
import
soundfile
except
ImportError
:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
try
:
import
av
import
resampy
except
ImportError
:
av
=
PlaceholderModule
(
"
av
"
)
# type: ignore[assignment]
resampy
=
PlaceholderModule
(
"
resampy
"
)
# type: ignore[assignment]
def
extract_audio_from_video_bytes
(
data
:
bytes
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
"""Extract the audio track from raw video bytes using PyAV.
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES
=
{
1
,
3
,
4
}
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
spawned, which is critical to avoid crashing CUDA-active vLLM
worker processes.
The returned waveform is at the native sample rate of the video's
audio stream. Resampling to a model-specific rate is left to the
downstream :class:`AudioResampler` in the parsing pipeline.
def
load_audio_pyav
(
path
:
BytesIO
|
Path
|
str
,
*
,
sr
:
float
|
None
=
22050
,
mono
:
bool
=
True
,
)
->
tuple
[
npt
.
NDArray
,
float
]:
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
Decodes the audio stream at its native sample rate. Channel reduction to
mono is performed by averaging across channels. Resampling to a
model-specific rate is left to the downstream :class:`AudioResampler`.
Args:
data: Raw video file bytes (e.g. from an mp4 file).
path: A :class:`~io.BytesIO` buffer, a filesystem
:class:`~pathlib.Path`, or a string path.
Returns:
A tuple of
``(waveform, sample_rate)``
suitable for use as an
:class:`AudioItem`
.
``(waveform, sample_rate)``
where *waveform* is a 1-D float32
NumPy array and *sample_rate* is the native sample rate in Hz
.
"""
if
data
is
None
or
len
(
data
)
==
0
:
raise
ValueError
(
"Cannot extract audio: video bytes are missing or empty. "
"Ensure video was loaded with keep_video_bytes=True for "
"audio-in-video extraction."
)
native_sr
=
None
try
:
with
av
.
open
(
BytesIO
(
data
)
)
as
container
:
with
av
.
open
(
path
)
as
container
:
if
not
container
.
streams
.
audio
:
raise
ValueError
(
"No audio stream found
in the video
."
)
raise
ValueError
(
"No audio stream found."
)
stream
=
container
.
streams
.
audio
[
0
]
stream
.
thread_type
=
"AUTO"
native_sr
=
stream
.
rate
sr
=
sr
or
native_sr
chunks
:
list
[
npt
.
NDArray
]
=
[]
for
frame
in
container
.
decode
(
audio
=
0
):
arr
=
frame
.
to_ndarray
()
chunks
.
append
(
arr
.
mean
(
axis
=
0
)
if
arr
.
ndim
>
1
else
arr
)
needs_resampling
=
not
math
.
isclose
(
float
(
sr
),
float
(
native_sr
),
rel_tol
=
0.0
,
abs_tol
=
1e-6
,
)
resampler
=
(
av
.
AudioResampler
(
format
=
"fltp"
,
layout
=
"mono"
,
rate
=
sr
)
if
needs_resampling
else
None
)
for
frame
in
container
.
decode
(
stream
):
if
needs_resampling
:
assert
resampler
is
not
None
for
out_frame
in
resampler
.
resample
(
frame
):
chunks
.
append
(
out_frame
.
to_ndarray
())
else
:
chunks
.
append
(
frame
.
to_ndarray
())
except
ValueError
:
raise
except
Exception
as
e
:
...
...
@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
if
not
chunks
:
raise
ValueError
(
"No audio found in the video."
)
audio
=
np
.
concatenate
(
chunks
).
astype
(
np
.
float32
)
return
audio
,
float
(
native_sr
)
audio
=
np
.
concatenate
(
chunks
,
axis
=-
1
).
astype
(
np
.
float32
)
if
mono
and
audio
.
ndim
>
1
:
audio
=
np
.
mean
(
audio
,
axis
=
0
)
return
audio
,
sr
def
is_video
(
data
:
bytes
)
->
bool
:
"""Check if the fetched bytes are video"""
if
len
(
data
)
<
12
:
return
False
box_type
=
data
[
4
:
8
]
major_brand
=
data
[
8
:
12
]
def
load_audio_soundfile
(
path
:
BytesIO
|
Path
|
str
,
*
,
sr
:
float
|
None
=
22050
,
mono
:
bool
=
True
,
)
->
tuple
[
np
.
ndarray
,
int
]:
"""Load audio via soundfile"""
with
soundfile
.
SoundFile
(
path
)
as
f
:
native_sr
=
f
.
samplerate
y
=
f
.
read
(
dtype
=
"float32"
,
always_2d
=
False
).
T
MP4_BRANDS
=
{
b
"mp41"
,
b
"mp42"
,
# MP4
b
"isom"
,
# ISO Base Media
b
"iso2"
,
b
"iso4"
,
b
"iso5"
,
b
"iso6"
,
b
"M4V "
,
b
"M4A "
,
# Apple
b
"avc1"
,
# H.264
b
"dash"
,
# DASH
b
"mmp4"
,
b
"MSNV"
,
}
if
mono
and
y
.
ndim
>
1
:
y
=
np
.
mean
(
y
,
axis
=
tuple
(
range
(
y
.
ndim
-
1
)))
is_avi
=
data
[:
4
]
==
b
"RIFF"
and
major_brand
==
b
"AVI "
is_mp4
=
box_type
==
b
"ftyp"
and
major_brand
in
MP4_BRANDS
return
is_mp4
or
is_avi
if
sr
is
not
None
and
sr
!=
native_sr
:
y
=
resampy
.
resample
(
y
,
sr_orig
=
native_sr
,
sr_new
=
sr
)
return
y
,
int
(
sr
)
return
y
,
native_sr
def
load_audio
(
path
:
BytesIO
|
Path
|
str
,
*
,
sr
:
float
|
None
=
22050
,
mono
:
bool
=
True
,
):
try
:
return
load_audio_soundfile
(
path
,
sr
=
sr
,
mono
=
mono
)
except
soundfile
.
LibsndfileError
as
exc
:
# Only fall back for known format-detection failures.
# Re-raise anything else (e.g. corrupt but recognised format).
if
exc
.
code
not
in
_BAD_SF_CODES
:
raise
# soundfile may have advanced the BytesIO seek position before failing;
# reset it so PyAV can read from the beginning.
if
isinstance
(
path
,
BytesIO
):
path
.
seek
(
0
)
try
:
return
load_audio_pyav
(
path
,
sr
=
sr
,
mono
=
mono
)
except
Exception
as
pyav_exc
:
raise
ValueError
(
"Invalid or unsupported audio file."
)
from
pyav_exc
class
AudioMediaIO
(
MediaIO
[
tuple
[
npt
.
NDArray
,
float
]]):
...
...
@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
self
.
kwargs
=
kwargs
def
load_bytes
(
self
,
data
:
bytes
)
->
tuple
[
npt
.
NDArray
,
float
]:
if
is_video
(
data
):
return
extract_audio_from_video_bytes
(
data
)
return
librosa
.
load
(
BytesIO
(
data
),
sr
=
None
)
return
load_audio
(
BytesIO
(
data
),
sr
=
None
)
def
load_base64
(
self
,
...
...
@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
return
self
.
load_bytes
(
pybase64
.
b64decode
(
data
))
def
load_file
(
self
,
filepath
:
Path
)
->
tuple
[
npt
.
NDArray
,
float
]:
return
l
ibrosa
.
load
(
filepath
,
sr
=
None
)
return
l
oad_audio
(
filepath
,
sr
=
None
)
def
encode_base64
(
self
,
...
...
vllm/multimodal/parse.py
View file @
c7f98b4d
...
...
@@ -497,7 +497,7 @@ class MultiModalDataParser:
*
,
target_sr
:
float
|
None
=
None
,
target_channels
:
int
|
None
=
None
,
audio_resample_method
:
Literal
[
"
librosa
"
,
"scipy"
]
=
"
librosa
"
,
audio_resample_method
:
Literal
[
"
pyav
"
,
"scipy"
]
=
"
pyav
"
,
video_needs_metadata
:
bool
=
False
,
expected_hidden_size
:
int
|
None
=
None
,
)
->
None
:
...
...
vllm/renderers/base.py
View file @
c7f98b4d
...
...
@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]):
For chat requests:
- Jinja2 template compilation
For multi-modal requests:
- Importing libraries such as librosa triggers JIT compilation.
"""
from
vllm.entrypoints.chat_utils
import
ChatTemplateResolutionError
...
...
vllm/transformers_utils/processors/fireredasr2.py
View file @
c7f98b4d
...
...
@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
for
speech
in
raw_speech
:
"""
We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using
librosa
.
using kaldiio.load_mat, while vLLM loads audio data using
pyav
.
"""
speech
=
speech
*
32768
fbank
=
self
.
fbank
(
sampling_rate
,
speech
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment