Unverified Commit c7f98b4d authored by Isotr0py, committed by GitHub
Browse files

[Frontend] Remove librosa from audio dependency (#37058)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 1c472f8f
......@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
......
......@@ -544,6 +544,7 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
# resampy
numpy==2.2.6
# via
# -r requirements/test.in
......@@ -584,6 +585,7 @@ numpy==2.2.6
# pyogrio
# pywavelets
# rasterio
# resampy
# rioxarray
# rouge-score
# runai-model-streamer
......@@ -995,6 +997,8 @@ requests==2.32.3
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test.in
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
......
......@@ -987,11 +987,11 @@ setup(
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
"audio": [
"librosa",
"av",
"resampy",
"scipy",
"soundfile",
"mistral_common[audio]",
"av",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility
......
......@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
model_name,
foscolo,
language="it",
expected_text="ove il mio corpo fanciulletto giacque",
expected_text="ove il mio corpo fanciulletto",
)
......@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
]
)
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
INPUT_TRANSCRIPTION_BATCH = (
json.dumps(
{
......
......@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
resampler = AudioResampler(
target_sr=16000,
method="librosa",
)
resampler = AudioResampler(target_sr=16000)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [
(
......
......@@ -10,6 +10,8 @@ import pytest
from vllm.multimodal.media import AudioMediaIO
from ...conftest import AudioTestAssets
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
......@@ -22,40 +24,32 @@ def dummy_audio():
@pytest.fixture
def dummy_audio_bytes():
return b"FAKEAUDIOBYTES"
def dummy_audio_bytes(audio_assets: AudioTestAssets):
with open(audio_assets[0].get_local_path(), "rb") as f:
return f.read()
def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO()
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_bytes(dummy_audio_bytes)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_base64("audio/wav", encoded)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_file():
def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
audio_io = AudioMediaIO()
path = Path("/fake/path.wav")
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_file(path)
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
path = audio_assets[0].get_local_path()
out = audio_io.load_file(path)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):
......
......@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec,
ChannelReduction,
normalize_audio,
resample_audio_librosa,
resample_audio_pyav,
resample_audio_scipy,
split_audio,
)
......@@ -25,14 +25,14 @@ def dummy_audio():
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
def test_resample_audio_librosa(dummy_audio):
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
mock_resample.return_value = dummy_audio * 2
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
mock_resample.assert_called_once_with(
dummy_audio, orig_sr=44100, target_sr=22050
)
assert np.all(out == dummy_audio * 2)
def test_resample_audio_pyav(dummy_audio):
out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
assert len(out_down) == 3
assert len(out_up) == 10
assert np.all(out_same == dummy_audio)
def test_resample_audio_scipy(dummy_audio):
......@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert np.isfinite(out).all()
def test_audio_resampler_librosa_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="librosa")
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
def test_audio_resampler_pyav_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="pyav")
with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
mock_resample.return_value = dummy_audio
out = resampler.resample(dummy_audio, orig_sr=44100)
mock_resample.assert_called_once_with(
......@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
def test_librosa_mono_passthrough_e2e(self):
"""Full pipeline: librosa mono format → preserved as mono."""
def test_pyav_mono_passthrough_e2e(self):
"""Full pipeline: pyav mono format → preserved as mono."""
from vllm.multimodal.parse import MultiModalDataParser
# Simulate librosa output: already mono (time,) format
mono_librosa = np.random.randn(16000).astype(np.float32)
assert mono_librosa.shape == (16000,)
# Simulate pyav output: already mono (time,) format
mono_pyav = np.random.randn(16000).astype(np.float32)
assert mono_pyav.shape == (16000,)
# Create parser with mono normalization
parser = MultiModalDataParser(
......@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
)
# Process audio through the parser
result = parser._parse_audio_data((mono_librosa, 16000))
result = parser._parse_audio_data((mono_pyav, 16000))
audio_output = result.get(0)
# Verify output is still mono 1D
......@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert audio_output.shape == (16000,)
# Verify audio content is preserved
np.testing.assert_array_almost_equal(audio_output, mono_librosa)
np.testing.assert_array_almost_equal(audio_output, mono_pyav)
def test_multichannel_5_1_surround_to_mono_e2e(self):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
......
......@@ -8,15 +8,10 @@ from urllib.parse import urljoin
import numpy.typing as npt
from vllm.utils.import_utils import PlaceholderModule
from vllm.multimodal.media.audio import load_audio
from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
ASSET_DIR = "multimodal_asset"
AudioAssetName = Literal["winning_call", "mary_had_lamb"]
......@@ -33,7 +28,7 @@ class AudioAsset:
@property
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
return librosa.load(audio_path, sr=None)
return load_audio(audio_path, sr=None)
def get_local_path(self) -> Path:
return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
......
......@@ -10,15 +10,10 @@ import numpy.typing as npt
from huggingface_hub import hf_hub_download
from PIL import Image
from vllm.utils.import_utils import PlaceholderModule
from vllm.multimodal.media.audio import load_audio_pyav
from .base import get_cache_dir
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
@lru_cache
def download_video_asset(filename: str) -> str:
......@@ -146,4 +141,4 @@ class VideoAsset:
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
"""
return librosa.load(self.video_path, sr=sampling_rate)[0]
return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
......@@ -38,6 +38,7 @@ from typing_extensions import deprecated
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.audio import get_audio_duration
from vllm.multimodal.image import convert_image_mode
from vllm.tokenizers import TokenizerLike
from vllm.utils.argparse_utils import FlexibleArgumentParser
......@@ -54,10 +55,6 @@ try:
except ImportError:
pd = PlaceholderModule("pandas")
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa")
logger = logging.getLogger(__name__)
......@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset):
break
audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr)
duration_s = get_audio_duration(y=y, sr=sr)
if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
skipped += 1
continue
......
......@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
from vllm.logger import init_logger
from vllm.logprobs import FlatLogprobs, Logprob
from vllm.model_executor.models import SupportsTranscription
from vllm.multimodal.audio import split_audio
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
from vllm.multimodal.audio import get_audio_duration, split_audio
from vllm.multimodal.media.audio import load_audio
from vllm.outputs import RequestOutput
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import get_tokenizer
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile as sf
except ImportError:
sf = PlaceholderModule("soundfile") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
......@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
# pre-requisite for chunking, as it assumes Whisper SR.
try:
with io.BytesIO(audio_data) as buf:
y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value]
except sf.LibsndfileError as exc:
# Only fall back for known format-detection failures.
# Re-raise anything else (e.g. corrupt but recognised format).
if exc.code not in _BAD_SF_CODES:
raise
logger.debug(
"librosa/soundfile could not decode audio from BytesIO "
"(code=%s: %s); falling back to pyav in-process decode",
exc.code,
exc,
)
try:
native_y, native_sr = extract_audio_from_video_bytes(audio_data)
sr = self.asr_config.sample_rate
y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
except Exception as pyav_exc:
logger.debug(
"pyAV fallback also failed: %s",
pyav_exc,
)
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
except Exception as exc:
raise ValueError("Invalid or unsupported audio file.") from exc
duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (
self.asr_config.allow_audio_chunking
duration = get_audio_duration(y=y, sr=sr)
do_split_audio = self.asr_config.allow_audio_chunking and (
self.asr_config.max_audio_clip_s is not None
and duration > self.asr_config.max_audio_clip_s
)
......
......@@ -12,6 +12,7 @@ import math
import warnings
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from io import BytesIO
from typing import Annotated, Literal, TypeAlias
import torch
......@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems,
VideoItem,
)
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
from vllm.multimodal.media.audio import load_audio_pyav
from vllm.multimodal.parse import (
AudioProcessorItems,
ImageEmbeddingItems,
......@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor(
"video must be loaded with keep_video_bytes=True (e.g. via "
"the chat API with a model that sets use_audio_in_video)."
)
audio_items.append(extract_audio_from_video_bytes(video_bytes))
audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
# Create a new VideoProcessorItems with metadata that does not contain
# the large video bytes, to avoid modifying the input `mm_items`.
......
......@@ -12,17 +12,35 @@ import torch
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
import av as av
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
av = PlaceholderModule("av") # type: ignore[assignment]
try:
import resampy
except ImportError:
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
try:
import scipy.signal as scipy_signal
except ImportError:
scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment]
# ============================================================
# Aligned with `librosa.get_duration` function
def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
    """Return how long an audio array lasts, in seconds.

    The sample count is taken from the last axis of ``y``, so both a 1D
    ``(samples,)`` array and a 2D ``(channels, samples)`` array are
    measured along their time axis.

    Args:
        y: Audio time series (keyword-only).
        sr: Sample rate of the audio in Hz (keyword-only).

    Returns:
        The length of the last axis of ``y`` divided by ``sr``.
    """
    return float(y.shape[-1]) / sr
class ChannelReduction(str, Enum):
......@@ -153,13 +171,71 @@ def normalize_audio(
# ============================================================
def resample_audio_librosa(
def resample_audio_pyav(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
target_sr: float,
) -> npt.NDArray[np.floating]:
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
"""Resample audio using PyAV (libswresample via FFmpeg).
Args:
audio: Input audio. Can be:
- 1D array ``(samples,)``: mono audio
- 2D array ``(channels, samples)``: stereo audio
orig_sr: Original sample rate in Hz.
target_sr: Target sample rate in Hz.
Returns:
Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
"""
orig_sr_int = int(round(orig_sr))
target_sr_int = int(round(target_sr))
if orig_sr_int == target_sr_int:
return audio
if audio.ndim == 2:
# Resample each channel independently and re-stack.
return np.stack(
[
resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
for ch in audio
],
axis=0,
)
expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
# from_ndarray expects shape (channels, samples) for planar formats.
# libswresample requires a minimum number of input samples to produce
# output frames; pad short inputs with zeros so we always get output,
# then trim to the expected output length.
_MIN_SAMPLES = 1024
audio_f32 = np.asarray(audio, dtype=np.float32)
if len(audio_f32) < _MIN_SAMPLES:
audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
audio_f32 = audio_f32.reshape(1, -1)
resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
frame.sample_rate = orig_sr_int
out_frames = resampler.resample(frame)
out_frames.extend(resampler.resample(None)) # flush buffered samples
result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
return result[:expected_len]
def resample_audio_resampy(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` using resampy.

    This is a thin adapter: it forwards to ``resampy.resample``, mapping
    this module's ``orig_sr`` / ``target_sr`` keywords onto resampy's
    ``sr_orig`` / ``sr_new`` parameters. All other resampy options are
    left at their library defaults.

    Args:
        audio: Input audio samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        Whatever ``resampy.resample`` returns for the given input.
    """
    resampled = resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
    return resampled
def resample_audio_scipy(
......@@ -167,7 +243,7 @@ def resample_audio_scipy(
*,
orig_sr: float,
target_sr: float,
):
) -> npt.NDArray[np.floating]:
if orig_sr > target_sr:
return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
elif orig_sr < target_sr:
......@@ -181,7 +257,7 @@ class AudioResampler:
def __init__(
self,
target_sr: float | None = None,
method: Literal["librosa", "scipy"] = "librosa",
method: Literal["pyav", "resampy", "scipy"] = "resampy",
):
self.target_sr = target_sr
self.method = method
......@@ -203,8 +279,10 @@ class AudioResampler:
abs_tol=1e-6,
):
return audio
if self.method == "librosa":
return resample_audio_librosa(
if self.method == "pyav":
return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
if self.method == "resampy":
return resample_audio_resampy(
audio, orig_sr=orig_sr, target_sr=self.target_sr
)
elif self.method == "scipy":
......@@ -214,7 +292,7 @@ class AudioResampler:
else:
raise ValueError(
f"Invalid resampling method: {self.method}. "
"Supported methods are 'librosa' and 'scipy'."
"Supported methods are 'pyav' and 'scipy'."
)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from io import BytesIO
from pathlib import Path
......@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO
try:
import librosa
import av
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
av = PlaceholderModule("av") # type: ignore[assignment]
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
try:
import av
import resampy
except ImportError:
av = PlaceholderModule("av") # type: ignore[assignment]
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
def extract_audio_from_video_bytes(
data: bytes,
) -> tuple[npt.NDArray, float]:
"""Extract the audio track from raw video bytes using PyAV.
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`.
# Used to tell whether an audio loading error is a server error or a client
# error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
spawned, which is critical to avoid crashing CUDA-active vLLM
worker processes.
The returned waveform is at the native sample rate of the video's
audio stream. Resampling to a model-specific rate is left to the
downstream :class:`AudioResampler` in the parsing pipeline.
def load_audio_pyav(
path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[npt.NDArray, float]:
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
Decodes the audio stream at its native sample rate. Channel reduction to
mono is performed by averaging across channels. Resampling to a
model-specific rate is left to the downstream :class:`AudioResampler`.
Args:
data: Raw video file bytes (e.g. from an mp4 file).
path: A :class:`~io.BytesIO` buffer, a filesystem
:class:`~pathlib.Path`, or a string path.
Returns:
A tuple of ``(waveform, sample_rate)`` suitable for use as an
:class:`AudioItem`.
``(waveform, sample_rate)`` where *waveform* is a 1-D float32
NumPy array and *sample_rate* is the native sample rate in Hz.
"""
if data is None or len(data) == 0:
raise ValueError(
"Cannot extract audio: video bytes are missing or empty. "
"Ensure video was loaded with keep_video_bytes=True for "
"audio-in-video extraction."
)
native_sr = None
try:
with av.open(BytesIO(data)) as container:
with av.open(path) as container:
if not container.streams.audio:
raise ValueError("No audio stream found in the video.")
raise ValueError("No audio stream found.")
stream = container.streams.audio[0]
stream.thread_type = "AUTO"
native_sr = stream.rate
sr = sr or native_sr
chunks: list[npt.NDArray] = []
for frame in container.decode(audio=0):
arr = frame.to_ndarray()
chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
needs_resampling = not math.isclose(
float(sr),
float(native_sr),
rel_tol=0.0,
abs_tol=1e-6,
)
resampler = (
av.AudioResampler(format="fltp", layout="mono", rate=sr)
if needs_resampling
else None
)
for frame in container.decode(stream):
if needs_resampling:
assert resampler is not None
for out_frame in resampler.resample(frame):
chunks.append(out_frame.to_ndarray())
else:
chunks.append(frame.to_ndarray())
except ValueError:
raise
except Exception as e:
......@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
if not chunks:
raise ValueError("No audio found in the video.")
audio = np.concatenate(chunks).astype(np.float32)
return audio, float(native_sr)
audio = np.concatenate(chunks, axis=-1).astype(np.float32)
if mono and audio.ndim > 1:
audio = np.mean(audio, axis=0)
return audio, sr
def is_video(data: bytes) -> bool:
"""Check if the fetched bytes are video"""
if len(data) < 12:
return False
box_type = data[4:8]
major_brand = data[8:12]
def load_audio_soundfile(
path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[np.ndarray, int]:
"""Load audio via soundfile"""
with soundfile.SoundFile(path) as f:
native_sr = f.samplerate
y = f.read(dtype="float32", always_2d=False).T
MP4_BRANDS = {
b"mp41",
b"mp42", # MP4
b"isom", # ISO Base Media
b"iso2",
b"iso4",
b"iso5",
b"iso6",
b"M4V ",
b"M4A ", # Apple
b"avc1", # H.264
b"dash", # DASH
b"mmp4",
b"MSNV",
}
if mono and y.ndim > 1:
y = np.mean(y, axis=tuple(range(y.ndim - 1)))
is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
return is_mp4 or is_avi
if sr is not None and sr != native_sr:
y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
return y, int(sr)
return y, native_sr
def load_audio(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
) -> tuple[npt.NDArray, float]:
    """Load audio with soundfile, falling back to PyAV when soundfile rejects it.

    The soundfile loader is tried first. If it raises
    ``soundfile.LibsndfileError`` with a code listed in ``_BAD_SF_CODES``,
    the same input is handed to the PyAV loader instead. Any other
    ``LibsndfileError`` is re-raised as-is.

    Args:
        path: A ``BytesIO`` buffer, a ``Path``, or a string path.
        sr: Sample rate forwarded to the underlying loader.
        mono: Mono flag forwarded to the underlying loader.

    Returns:
        The ``(waveform, sample_rate)`` tuple produced by whichever loader
        succeeded.

    Raises:
        ValueError: If the PyAV fallback fails for any reason.
        soundfile.LibsndfileError: If soundfile fails with a code that is
            not in ``_BAD_SF_CODES``.
    """
    try:
        return load_audio_soundfile(path, sr=sr, mono=mono)
    except soundfile.LibsndfileError as sf_error:
        # Only fall back for known format-detection failures.
        # Re-raise anything else (e.g. corrupt but recognised format).
        if sf_error.code not in _BAD_SF_CODES:
            raise

    # Reaching this point means soundfile rejected the input with one of the
    # fallback-eligible codes.
    # soundfile may have advanced the BytesIO seek position before failing;
    # reset it so PyAV can read from the beginning.
    if isinstance(path, BytesIO):
        path.seek(0)

    try:
        decoded = load_audio_pyav(path, sr=sr, mono=mono)
    except Exception as fallback_error:
        raise ValueError("Invalid or unsupported audio file.") from fallback_error
    return decoded
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
......@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
self.kwargs = kwargs
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
if is_video(data):
return extract_audio_from_video_bytes(data)
return librosa.load(BytesIO(data), sr=None)
return load_audio(BytesIO(data), sr=None)
def load_base64(
self,
......@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
return self.load_bytes(pybase64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
return librosa.load(filepath, sr=None)
return load_audio(filepath, sr=None)
def encode_base64(
self,
......
......@@ -497,7 +497,7 @@ class MultiModalDataParser:
*,
target_sr: float | None = None,
target_channels: int | None = None,
audio_resample_method: Literal["librosa", "scipy"] = "librosa",
audio_resample_method: Literal["pyav", "scipy"] = "pyav",
video_needs_metadata: bool = False,
expected_hidden_size: int | None = None,
) -> None:
......
......@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]):
For chat requests:
- Jinja2 template compilation
For multi-modal requests:
- Importing libraries such as librosa triggers JIT compilation.
"""
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
......
......@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
for speech in raw_speech:
"""
We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using librosa.
using kaldiio.load_mat, while vLLM loads audio data using pyav.
"""
speech = speech * 32768
fbank = self.fbank(sampling_rate, speech)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment