Unverified commit c7f98b4d, authored by Isotr0py and committed by GitHub
Browse files

[Frontend] Remove librosa from audio dependency (#37058)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 1c472f8f
...@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test ...@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test peft>=0.15.0 # required for phi-4-mm test
pqdm pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
sentence-transformers>=5.2.0 # required for embedding tests sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
......
...@@ -544,6 +544,7 @@ numba==0.61.2 ...@@ -544,6 +544,7 @@ numba==0.61.2
# via # via
# -r requirements/test.in # -r requirements/test.in
# librosa # librosa
# resampy
numpy==2.2.6 numpy==2.2.6
# via # via
# -r requirements/test.in # -r requirements/test.in
...@@ -584,6 +585,7 @@ numpy==2.2.6 ...@@ -584,6 +585,7 @@ numpy==2.2.6
# pyogrio # pyogrio
# pywavelets # pywavelets
# rasterio # rasterio
# resampy
# rioxarray # rioxarray
# rouge-score # rouge-score
# runai-model-streamer # runai-model-streamer
...@@ -995,6 +997,8 @@ requests==2.32.3 ...@@ -995,6 +997,8 @@ requests==2.32.3
# tiktoken # tiktoken
# transformers # transformers
# wandb # wandb
resampy==0.4.3
# via -r requirements/test.in
responses==0.25.3 responses==0.25.3
# via genai-perf # via genai-perf
rfc3339-validator==0.1.4 rfc3339-validator==0.1.4
......
...@@ -987,11 +987,11 @@ setup( ...@@ -987,11 +987,11 @@ setup(
"instanttensor": ["instanttensor >= 0.1.5"], "instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"], "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
"audio": [ "audio": [
"librosa", "av",
"resampy",
"scipy", "scipy",
"soundfile", "soundfile",
"mistral_common[audio]", "mistral_common[audio]",
"av",
], # Required for audio processing ], # Required for audio processing
"video": [], # Kept for backwards compatibility "video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility "flashinfer": [], # Kept for backwards compatibility
......
...@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name) ...@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
model_name, model_name,
foscolo, foscolo,
language="it", language="it",
expected_text="ove il mio corpo fanciulletto giacque", expected_text="ove il mio corpo fanciulletto",
) )
...@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join( ...@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
] ]
) )
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
INPUT_TRANSCRIPTION_BATCH = ( INPUT_TRANSCRIPTION_BATCH = (
json.dumps( json.dumps(
{ {
......
...@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info( ...@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info.audio_idx_to_prompt, test_info.audio_idx_to_prompt,
test_info.prompt_formatter, test_info.prompt_formatter,
) )
resampler = AudioResampler( resampler = AudioResampler(target_sr=16000)
target_sr=16000,
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets] audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [ resampled_audios = [
( (
......
...@@ -10,6 +10,8 @@ import pytest ...@@ -10,6 +10,8 @@ import pytest
from vllm.multimodal.media import AudioMediaIO from vllm.multimodal.media import AudioMediaIO
from ...conftest import AudioTestAssets
pytestmark = pytest.mark.cpu_test pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets" ASSETS_DIR = Path(__file__).parent.parent / "assets"
...@@ -22,40 +24,32 @@ def dummy_audio(): ...@@ -22,40 +24,32 @@ def dummy_audio():
@pytest.fixture @pytest.fixture
def dummy_audio_bytes(): def dummy_audio_bytes(audio_assets: AudioTestAssets):
return b"FAKEAUDIOBYTES" with open(audio_assets[0].get_local_path(), "rb") as f:
return f.read()
def test_audio_media_io_load_bytes(dummy_audio_bytes): def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO() audio_io = AudioMediaIO()
with patch("librosa.load") as mock_load: out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.return_value = (np.array([0.1, 0.2]), 16000) assert isinstance(out[0], np.ndarray)
out = audio_io.load_bytes(dummy_audio_bytes) assert out[1] == 16000
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes): def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO() audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8") encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes: out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000) assert isinstance(out[0], np.ndarray)
out = audio_io.load_base64("audio/wav", encoded) assert out[1] == 16000
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_file(): def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
audio_io = AudioMediaIO() audio_io = AudioMediaIO()
path = Path("/fake/path.wav") path = audio_assets[0].get_local_path()
with patch("librosa.load") as mock_load: out = audio_io.load_file(path)
mock_load.return_value = (np.array([0.1, 0.2]), 16000) assert isinstance(out[0], np.ndarray)
out = audio_io.load_file(path) assert out[1] == 16000
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio): def test_audio_media_io_encode_base64(dummy_audio):
......
...@@ -14,7 +14,7 @@ from vllm.multimodal.audio import ( ...@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec, AudioSpec,
ChannelReduction, ChannelReduction,
normalize_audio, normalize_audio,
resample_audio_librosa, resample_audio_pyav,
resample_audio_scipy, resample_audio_scipy,
split_audio, split_audio,
) )
...@@ -25,14 +25,14 @@ def dummy_audio(): ...@@ -25,14 +25,14 @@ def dummy_audio():
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float) return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
def test_resample_audio_librosa(dummy_audio): def test_resample_audio_pyav(dummy_audio):
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample: out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
mock_resample.return_value = dummy_audio * 2 out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050) out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
mock_resample.assert_called_once_with(
dummy_audio, orig_sr=44100, target_sr=22050 assert len(out_down) == 3
) assert len(out_up) == 10
assert np.all(out == dummy_audio * 2) assert np.all(out_same == dummy_audio)
def test_resample_audio_scipy(dummy_audio): def test_resample_audio_scipy(dummy_audio):
...@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio): ...@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert np.isfinite(out).all() assert np.isfinite(out).all()
def test_audio_resampler_librosa_calls_resample(dummy_audio): def test_audio_resampler_pyav_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="librosa") resampler = AudioResampler(target_sr=22050, method="pyav")
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample: with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
mock_resample.return_value = dummy_audio mock_resample.return_value = dummy_audio
out = resampler.resample(dummy_audio, orig_sr=44100) out = resampler.resample(dummy_audio, orig_sr=44100)
mock_resample.assert_called_once_with( mock_resample.assert_called_once_with(
...@@ -423,13 +423,13 @@ class TestAudioPipelineE2E: ...@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0 # Verify channel averaging: mean of [0.5, -0.5] = 0.0
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5) np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
def test_librosa_mono_passthrough_e2e(self): def test_pyav_mono_passthrough_e2e(self):
"""Full pipeline: librosa mono format → preserved as mono.""" """Full pipeline: pyav mono format → preserved as mono."""
from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.parse import MultiModalDataParser
# Simulate librosa output: already mono (time,) format # Simulate pyav output: already mono (time,) format
mono_librosa = np.random.randn(16000).astype(np.float32) mono_pyav = np.random.randn(16000).astype(np.float32)
assert mono_librosa.shape == (16000,) assert mono_pyav.shape == (16000,)
# Create parser with mono normalization # Create parser with mono normalization
parser = MultiModalDataParser( parser = MultiModalDataParser(
...@@ -438,7 +438,7 @@ class TestAudioPipelineE2E: ...@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
) )
# Process audio through the parser # Process audio through the parser
result = parser._parse_audio_data((mono_librosa, 16000)) result = parser._parse_audio_data((mono_pyav, 16000))
audio_output = result.get(0) audio_output = result.get(0)
# Verify output is still mono 1D # Verify output is still mono 1D
...@@ -446,7 +446,7 @@ class TestAudioPipelineE2E: ...@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert audio_output.shape == (16000,) assert audio_output.shape == (16000,)
# Verify audio content is preserved # Verify audio content is preserved
np.testing.assert_array_almost_equal(audio_output, mono_librosa) np.testing.assert_array_almost_equal(audio_output, mono_pyav)
def test_multichannel_5_1_surround_to_mono_e2e(self): def test_multichannel_5_1_surround_to_mono_e2e(self):
"""Full pipeline: 5.1 surround (6 channels) → mono output.""" """Full pipeline: 5.1 surround (6 channels) → mono output."""
......
...@@ -8,15 +8,10 @@ from urllib.parse import urljoin ...@@ -8,15 +8,10 @@ from urllib.parse import urljoin
import numpy.typing as npt import numpy.typing as npt
from vllm.utils.import_utils import PlaceholderModule from vllm.multimodal.media.audio import load_audio
from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
ASSET_DIR = "multimodal_asset" ASSET_DIR = "multimodal_asset"
AudioAssetName = Literal["winning_call", "mary_had_lamb"] AudioAssetName = Literal["winning_call", "mary_had_lamb"]
...@@ -33,7 +28,7 @@ class AudioAsset: ...@@ -33,7 +28,7 @@ class AudioAsset:
@property @property
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
return librosa.load(audio_path, sr=None) return load_audio(audio_path, sr=None)
def get_local_path(self) -> Path: def get_local_path(self) -> Path:
return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
......
...@@ -10,15 +10,10 @@ import numpy.typing as npt ...@@ -10,15 +10,10 @@ import numpy.typing as npt
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
from PIL import Image from PIL import Image
from vllm.utils.import_utils import PlaceholderModule from vllm.multimodal.media.audio import load_audio_pyav
from .base import get_cache_dir from .base import get_cache_dir
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
@lru_cache @lru_cache
def download_video_asset(filename: str) -> str: def download_video_asset(filename: str) -> str:
...@@ -146,4 +141,4 @@ class VideoAsset: ...@@ -146,4 +141,4 @@ class VideoAsset:
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
""" """
return librosa.load(self.video_path, sr=sampling_rate)[0] return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
...@@ -38,6 +38,7 @@ from typing_extensions import deprecated ...@@ -38,6 +38,7 @@ from typing_extensions import deprecated
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.audio import get_audio_duration
from vllm.multimodal.image import convert_image_mode from vllm.multimodal.image import convert_image_mode
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
...@@ -54,10 +55,6 @@ try: ...@@ -54,10 +55,6 @@ try:
except ImportError: except ImportError:
pd = PlaceholderModule("pandas") pd = PlaceholderModule("pandas")
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset): ...@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset):
break break
audio = item["audio"] audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"] y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr) duration_s = get_audio_duration(y=y, sr=sr)
if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec: if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
skipped += 1 skipped += 1
continue continue
......
...@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs ...@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logprobs import FlatLogprobs, Logprob from vllm.logprobs import FlatLogprobs, Logprob
from vllm.model_executor.models import SupportsTranscription from vllm.model_executor.models import SupportsTranscription
from vllm.multimodal.audio import split_audio from vllm.multimodal.audio import get_audio_duration, split_audio
from vllm.multimodal.media.audio import extract_audio_from_video_bytes from vllm.multimodal.media.audio import load_audio
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile as sf
except ImportError:
sf = PlaceholderModule("soundfile") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = ( SpeechToTextResponseVerbose: TypeAlias = (
...@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing): ...@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
# pre-requisite for chunking, as it assumes Whisper SR. # pre-requisite for chunking, as it assumes Whisper SR.
try: try:
with io.BytesIO(audio_data) as buf: with io.BytesIO(audio_data) as buf:
y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value] y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
except sf.LibsndfileError as exc: except Exception as exc:
# Only fall back for known format-detection failures. raise ValueError("Invalid or unsupported audio file.") from exc
# Re-raise anything else (e.g. corrupt but recognised format).
if exc.code not in _BAD_SF_CODES:
raise
logger.debug(
"librosa/soundfile could not decode audio from BytesIO "
"(code=%s: %s); falling back to pyav in-process decode",
exc.code,
exc,
)
try:
native_y, native_sr = extract_audio_from_video_bytes(audio_data)
sr = self.asr_config.sample_rate
y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
except Exception as pyav_exc:
logger.debug(
"pyAV fallback also failed: %s",
pyav_exc,
)
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
duration = librosa.get_duration(y=y, sr=sr) duration = get_audio_duration(y=y, sr=sr)
do_split_audio = ( do_split_audio = self.asr_config.allow_audio_chunking and (
self.asr_config.allow_audio_chunking self.asr_config.max_audio_clip_s is not None
and duration > self.asr_config.max_audio_clip_s and duration > self.asr_config.max_audio_clip_s
) )
......
...@@ -12,6 +12,7 @@ import math ...@@ -12,6 +12,7 @@ import math
import warnings import warnings
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property from functools import cached_property
from io import BytesIO
from typing import Annotated, Literal, TypeAlias from typing import Annotated, Literal, TypeAlias
import torch import torch
...@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import ( ...@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems, MultiModalKwargsItems,
VideoItem, VideoItem,
) )
from vllm.multimodal.media.audio import extract_audio_from_video_bytes from vllm.multimodal.media.audio import load_audio_pyav
from vllm.multimodal.parse import ( from vllm.multimodal.parse import (
AudioProcessorItems, AudioProcessorItems,
ImageEmbeddingItems, ImageEmbeddingItems,
...@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor( ...@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor(
"video must be loaded with keep_video_bytes=True (e.g. via " "video must be loaded with keep_video_bytes=True (e.g. via "
"the chat API with a model that sets use_audio_in_video)." "the chat API with a model that sets use_audio_in_video)."
) )
audio_items.append(extract_audio_from_video_bytes(video_bytes)) audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
# Create a new VideoProcessorItems with metadata that does not contain # Create a new VideoProcessorItems with metadata that does not contain
# the large video bytes, to avoid modifying the input `mm_items`. # the large video bytes, to avoid modifying the input `mm_items`.
......
...@@ -12,17 +12,35 @@ import torch ...@@ -12,17 +12,35 @@ import torch
from vllm.utils.import_utils import PlaceholderModule from vllm.utils.import_utils import PlaceholderModule
try: try:
import librosa import av as av
except ImportError: except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment] av = PlaceholderModule("av") # type: ignore[assignment]
try:
import resampy
except ImportError:
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
try: try:
import scipy.signal as scipy_signal import scipy.signal as scipy_signal
except ImportError: except ImportError:
scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment] scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment]
# ============================================================ # ============================================================
# Aligned with `librosa.get_duration` function
def get_audio_duration(*, y: npt.ArrayLike, sr: float = 22050) -> float:
    """Get the duration of an audio signal in seconds.

    The duration is the length of the last axis of ``y`` divided by ``sr``.

    Args:
        y: Audio time series. Can be 1D ``(samples,)`` or 2D
            ``(channels, samples)``; any array-like whose last axis holds
            the samples is accepted.
        sr: Sample rate of the audio in Hz. Must be strictly positive.

    Returns:
        Duration of the audio in seconds.

    Raises:
        ValueError: If ``y`` is a scalar (has no sample axis) or if ``sr``
            is zero, negative or NaN.
    """
    # np.shape works on ndarrays and plain sequences alike, without copying.
    shape = np.shape(y)
    if not shape:
        raise ValueError("Audio must have at least one dimension.")
    # `not sr > 0` also rejects NaN, which would otherwise propagate silently.
    if not sr > 0:
        raise ValueError(f"Sample rate must be positive, got {sr!r}.")
    return float(shape[-1]) / float(sr)
class ChannelReduction(str, Enum): class ChannelReduction(str, Enum):
...@@ -153,13 +171,71 @@ def normalize_audio( ...@@ -153,13 +171,71 @@ def normalize_audio(
# ============================================================ # ============================================================
def resample_audio_librosa( def resample_audio_pyav(
audio: npt.NDArray[np.floating], audio: npt.NDArray[np.floating],
*, *,
orig_sr: float, orig_sr: float,
target_sr: float, target_sr: float,
) -> npt.NDArray[np.floating]: ) -> npt.NDArray[np.floating]:
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) """Resample audio using PyAV (libswresample via FFmpeg).
Args:
audio: Input audio. Can be:
- 1D array ``(samples,)``: mono audio
- 2D array ``(channels, samples)``: stereo audio
orig_sr: Original sample rate in Hz.
target_sr: Target sample rate in Hz.
Returns:
Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
"""
orig_sr_int = int(round(orig_sr))
target_sr_int = int(round(target_sr))
if orig_sr_int == target_sr_int:
return audio
if audio.ndim == 2:
# Resample each channel independently and re-stack.
return np.stack(
[
resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
for ch in audio
],
axis=0,
)
expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
# from_ndarray expects shape (channels, samples) for planar formats.
# libswresample requires a minimum number of input samples to produce
# output frames; pad short inputs with zeros so we always get output,
# then trim to the expected output length.
_MIN_SAMPLES = 1024
audio_f32 = np.asarray(audio, dtype=np.float32)
if len(audio_f32) < _MIN_SAMPLES:
audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
audio_f32 = audio_f32.reshape(1, -1)
resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
frame.sample_rate = orig_sr_int
out_frames = resampler.resample(frame)
out_frames.extend(resampler.resample(None)) # flush buffered samples
result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
return result[:expected_len]
def resample_audio_resampy(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` using resampy.

    This is a thin wrapper that forwards its arguments to
    ``resampy.resample`` (``sr_orig=orig_sr``, ``sr_new=target_sr``) and
    returns whatever that call returns.

    Args:
        audio: Input audio samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        The resampled audio.
    """
    resampled = resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
    return resampled
def resample_audio_scipy( def resample_audio_scipy(
...@@ -167,7 +243,7 @@ def resample_audio_scipy( ...@@ -167,7 +243,7 @@ def resample_audio_scipy(
*, *,
orig_sr: float, orig_sr: float,
target_sr: float, target_sr: float,
): ) -> npt.NDArray[np.floating]:
if orig_sr > target_sr: if orig_sr > target_sr:
return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr) return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
elif orig_sr < target_sr: elif orig_sr < target_sr:
...@@ -181,7 +257,7 @@ class AudioResampler: ...@@ -181,7 +257,7 @@ class AudioResampler:
def __init__( def __init__(
self, self,
target_sr: float | None = None, target_sr: float | None = None,
method: Literal["librosa", "scipy"] = "librosa", method: Literal["pyav", "resampy", "scipy"] = "resampy",
): ):
self.target_sr = target_sr self.target_sr = target_sr
self.method = method self.method = method
...@@ -203,8 +279,10 @@ class AudioResampler: ...@@ -203,8 +279,10 @@ class AudioResampler:
abs_tol=1e-6, abs_tol=1e-6,
): ):
return audio return audio
if self.method == "librosa": if self.method == "pyav":
return resample_audio_librosa( return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
if self.method == "resampy":
return resample_audio_resampy(
audio, orig_sr=orig_sr, target_sr=self.target_sr audio, orig_sr=orig_sr, target_sr=self.target_sr
) )
elif self.method == "scipy": elif self.method == "scipy":
...@@ -214,7 +292,7 @@ class AudioResampler: ...@@ -214,7 +292,7 @@ class AudioResampler:
else: else:
raise ValueError( raise ValueError(
f"Invalid resampling method: {self.method}. " f"Invalid resampling method: {self.method}. "
"Supported methods are 'librosa' and 'scipy'." "Supported methods are 'pyav' and 'scipy'."
) )
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
...@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64 ...@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO from .base import MediaIO
try: try:
import librosa import av
except ImportError: except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment] av = PlaceholderModule("av") # type: ignore[assignment]
try: try:
import soundfile import soundfile
except ImportError: except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
try: try:
import av import resampy
except ImportError: except ImportError:
av = PlaceholderModule("av") # type: ignore[assignment] resampy = PlaceholderModule("resampy") # type: ignore[assignment]
def extract_audio_from_video_bytes( # Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
data: bytes, # being librosa's main backend. Used to validate if an audio loading error is due to a
) -> tuple[npt.NDArray, float]: # server error vs a client error (invalid audio file).
"""Extract the audio track from raw video bytes using PyAV. # 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
spawned, which is critical to avoid crashing CUDA-active vLLM
worker processes.
The returned waveform is at the native sample rate of the video's def load_audio_pyav(
audio stream. Resampling to a model-specific rate is left to the path: BytesIO | Path | str,
downstream :class:`AudioResampler` in the parsing pipeline. *,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[npt.NDArray, float]:
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
Decodes the audio stream at its native sample rate. Channel reduction to
mono is performed by averaging across channels. Resampling to a
model-specific rate is left to the downstream :class:`AudioResampler`.
Args: Args:
data: Raw video file bytes (e.g. from an mp4 file). path: A :class:`~io.BytesIO` buffer, a filesystem
:class:`~pathlib.Path`, or a string path.
Returns: Returns:
A tuple of ``(waveform, sample_rate)`` suitable for use as an ``(waveform, sample_rate)`` where *waveform* is a 1-D float32
:class:`AudioItem`. NumPy array and *sample_rate* is the native sample rate in Hz.
""" """
if data is None or len(data) == 0: native_sr = None
raise ValueError(
"Cannot extract audio: video bytes are missing or empty. "
"Ensure video was loaded with keep_video_bytes=True for "
"audio-in-video extraction."
)
try: try:
with av.open(BytesIO(data)) as container: with av.open(path) as container:
if not container.streams.audio: if not container.streams.audio:
raise ValueError("No audio stream found in the video.") raise ValueError("No audio stream found.")
stream = container.streams.audio[0] stream = container.streams.audio[0]
stream.thread_type = "AUTO"
native_sr = stream.rate native_sr = stream.rate
sr = sr or native_sr
chunks: list[npt.NDArray] = [] chunks: list[npt.NDArray] = []
for frame in container.decode(audio=0): needs_resampling = not math.isclose(
arr = frame.to_ndarray() float(sr),
chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr) float(native_sr),
rel_tol=0.0,
abs_tol=1e-6,
)
resampler = (
av.AudioResampler(format="fltp", layout="mono", rate=sr)
if needs_resampling
else None
)
for frame in container.decode(stream):
if needs_resampling:
assert resampler is not None
for out_frame in resampler.resample(frame):
chunks.append(out_frame.to_ndarray())
else:
chunks.append(frame.to_ndarray())
except ValueError: except ValueError:
raise raise
except Exception as e: except Exception as e:
...@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes( ...@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
if not chunks: if not chunks:
raise ValueError("No audio found in the video.") raise ValueError("No audio found in the video.")
audio = np.concatenate(chunks).astype(np.float32) audio = np.concatenate(chunks, axis=-1).astype(np.float32)
return audio, float(native_sr) if mono and audio.ndim > 1:
audio = np.mean(audio, axis=0)
return audio, sr
def is_video(data: bytes) -> bool:
"""Check if the fetched bytes are video"""
if len(data) < 12:
return False
box_type = data[4:8] def load_audio_soundfile(
major_brand = data[8:12] path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[np.ndarray, int]:
"""Load audio via soundfile"""
with soundfile.SoundFile(path) as f:
native_sr = f.samplerate
y = f.read(dtype="float32", always_2d=False).T
MP4_BRANDS = { if mono and y.ndim > 1:
b"mp41", y = np.mean(y, axis=tuple(range(y.ndim - 1)))
b"mp42", # MP4
b"isom", # ISO Base Media
b"iso2",
b"iso4",
b"iso5",
b"iso6",
b"M4V ",
b"M4A ", # Apple
b"avc1", # H.264
b"dash", # DASH
b"mmp4",
b"MSNV",
}
is_avi = data[:4] == b"RIFF" and major_brand == b"AVI " if sr is not None and sr != native_sr:
is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
return is_mp4 or is_avi return y, int(sr)
return y, native_sr
def load_audio(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
):
    """Load audio with soundfile, falling back to PyAV when it cannot decode.

    ``load_audio_soundfile`` is tried first. If it raises a
    ``soundfile.LibsndfileError`` whose code is one of ``_BAD_SF_CODES``,
    the same input is retried with ``load_audio_pyav``.

    Args:
        path: A ``BytesIO`` buffer, a filesystem ``Path``, or a string path.
        sr: Forwarded unchanged to the backend loader.
        mono: Forwarded unchanged to the backend loader.

    Returns:
        Whatever the backend loader that succeeded returns.

    Raises:
        soundfile.LibsndfileError: If soundfile fails with a code outside
            ``_BAD_SF_CODES``.
        ValueError: If the PyAV fallback fails for any reason.
    """
    try:
        return load_audio_soundfile(path, sr=sr, mono=mono)
    except soundfile.LibsndfileError as sf_error:
        # Anything other than a known format-detection failure is re-raised
        # untouched (e.g. a recognised but corrupt file).
        can_fall_back = sf_error.code in _BAD_SF_CODES
        if not can_fall_back:
            raise

        # soundfile may have consumed part of an in-memory buffer before
        # failing; rewind so PyAV reads from the start.
        if isinstance(path, BytesIO):
            path.seek(0)

        try:
            return load_audio_pyav(path, sr=sr, mono=mono)
        except Exception as pyav_exc:
            raise ValueError("Invalid or unsupported audio file.") from pyav_exc
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
...@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): ...@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
self.kwargs = kwargs self.kwargs = kwargs
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
if is_video(data): return load_audio(BytesIO(data), sr=None)
return extract_audio_from_video_bytes(data)
return librosa.load(BytesIO(data), sr=None)
def load_base64( def load_base64(
self, self,
...@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): ...@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
return self.load_bytes(pybase64.b64decode(data)) return self.load_bytes(pybase64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
return librosa.load(filepath, sr=None) return load_audio(filepath, sr=None)
def encode_base64( def encode_base64(
self, self,
......
...@@ -497,7 +497,7 @@ class MultiModalDataParser: ...@@ -497,7 +497,7 @@ class MultiModalDataParser:
*, *,
target_sr: float | None = None, target_sr: float | None = None,
target_channels: int | None = None, target_channels: int | None = None,
audio_resample_method: Literal["librosa", "scipy"] = "librosa", audio_resample_method: Literal["pyav", "scipy"] = "pyav",
video_needs_metadata: bool = False, video_needs_metadata: bool = False,
expected_hidden_size: int | None = None, expected_hidden_size: int | None = None,
) -> None: ) -> None:
......
...@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]): ...@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]):
For chat requests: For chat requests:
- Jinja2 template compilation - Jinja2 template compilation
For multi-modal requests:
- Importing libraries such as librosa triggers JIT compilation.
""" """
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
......
...@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor): ...@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
for speech in raw_speech: for speech in raw_speech:
""" """
We must multiply by 32768 here because FireRedASR2 loads audio data We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using librosa. using kaldiio.load_mat, while vLLM loads audio data using pyav.
""" """
speech = speech * 32768 speech = speech * 32768
fbank = self.fbank(sampling_rate, speech) fbank = self.fbank(sampling_rate, speech)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment