Unverified Commit 153ba7f0 authored by Nick Cao's avatar Nick Cao Committed by GitHub
Browse files

[Refactor] Drop direct dependency on librosa (#39079)


Signed-off-by: default avatarNick Cao <ncao@redhat.com>
Co-authored-by: default avatarClaude <noreply@anthropic.com>
parent 87518c30
...@@ -193,7 +193,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: ...@@ -193,7 +193,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
The API server takes care of basic audio I/O and optional chunking before building prompts: The API server takes care of basic audio I/O and optional chunking before building prompts:
- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`. - Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `AudioResampler`.
- Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`. - Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`.
- Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words. - Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words.
...@@ -206,8 +206,8 @@ Relevant server logic: ...@@ -206,8 +206,8 @@ Relevant server logic:
async def _preprocess_speech_to_text(...): async def _preprocess_speech_to_text(...):
language = self.model_cls.validate_language(request.language) language = self.model_cls.validate_language(request.language)
... ...
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate) y, sr = load_audio(bytes_, sr=self.asr_config.sample_rate)
duration = librosa.get_duration(y=y, sr=sr) duration = get_audio_duration(y=y, sr=sr)
do_split_audio = (self.asr_config.allow_audio_chunking do_split_audio = (self.asr_config.allow_audio_chunking
and duration > self.asr_config.max_audio_clip_s) and duration > self.asr_config.max_audio_clip_s)
chunks = [y] if not do_split_audio else self._split_audio(y, int(sr)) chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
......
...@@ -300,12 +300,12 @@ Full example: [examples/offline_inference/audio_language.py](../../examples/offl ...@@ -300,12 +300,12 @@ Full example: [examples/offline_inference/audio_language.py](../../examples/offl
Speech-to-text models like Whisper have a maximum audio length they can process (typically 30 seconds). For longer audio files, vLLM provides a utility to intelligently split audio into chunks at quiet points to minimize cutting through speech. Speech-to-text models like Whisper have a maximum audio length they can process (typically 30 seconds). For longer audio files, vLLM provides a utility to intelligently split audio into chunks at quiet points to minimize cutting through speech.
```python ```python
import librosa
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.multimodal.audio import split_audio from vllm.multimodal.audio import split_audio
from vllm.multimodal.media.audio import load_audio
# Load long audio file # Load long audio file
audio, sr = librosa.load("long_audio.wav", sr=16000) audio, sr = load_audio("long_audio.wav", sr=16000)
# Split into chunks at low-energy (quiet) regions # Split into chunks at low-energy (quiet) regions
chunks = split_audio( chunks = split_audio(
...@@ -832,7 +832,7 @@ Then, you can use the OpenAI client as follows: ...@@ -832,7 +832,7 @@ Then, you can use the OpenAI client as follows:
base_url=openai_api_base, base_url=openai_api_base,
) )
# Any format supported by librosa is supported # Any format supported by soundfile/PyAV is supported
audio_url = AudioAsset("winning_call").url audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url) audio_base64 = encode_base64_content_from_url(audio_url)
......
...@@ -267,7 +267,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None: ...@@ -267,7 +267,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
{ {
"type": "input_audio", "type": "input_audio",
"input_audio": { "input_audio": {
# Any format supported by librosa is supported # Any format supported by soundfile/PyAV is supported
"data": audio_base64, "data": audio_base64,
"format": "wav", "format": "wav",
}, },
...@@ -292,7 +292,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None: ...@@ -292,7 +292,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
{ {
"type": "audio_url", "type": "audio_url",
"audio_url": { "audio_url": {
# Any format supported by librosa is supported # Any format supported by soundfile/PyAV is supported
"url": audio_url "url": audio_url
}, },
}, },
...@@ -316,7 +316,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None: ...@@ -316,7 +316,7 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
{ {
"type": "audio_url", "type": "audio_url",
"audio_url": { "audio_url": {
# Any format supported by librosa is supported # Any format supported by soundfile/PyAV is supported
"url": f"data:audio/ogg;base64,{audio_base64}" "url": f"data:audio/ogg;base64,{audio_base64}"
}, },
}, },
......
...@@ -12,7 +12,6 @@ model, for example: ...@@ -12,7 +12,6 @@ model, for example:
Requirements: Requirements:
- vllm with audio support - vllm with audio support
- websockets - websockets
- librosa
- numpy - numpy
The script: The script:
...@@ -26,12 +25,12 @@ import argparse ...@@ -26,12 +25,12 @@ import argparse
import asyncio import asyncio
import json import json
import librosa
import numpy as np import numpy as np
import pybase64 as base64 import pybase64 as base64
import websockets import websockets
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.multimodal.media.audio import load_audio
def audio_to_pcm16_base64(audio_path: str) -> str: def audio_to_pcm16_base64(audio_path: str) -> str:
...@@ -39,7 +38,7 @@ def audio_to_pcm16_base64(audio_path: str) -> str: ...@@ -39,7 +38,7 @@ def audio_to_pcm16_base64(audio_path: str) -> str:
Load an audio file and convert it to base64-encoded PCM16 @ 16kHz. Load an audio file and convert it to base64-encoded PCM16 @ 16kHz.
""" """
# Load audio and resample to 16kHz mono # Load audio and resample to 16kHz mono
audio, _ = librosa.load(audio_path, sr=16000, mono=True) audio, _ = load_audio(audio_path, sr=16000, mono=True)
# Convert to PCM16 # Convert to PCM16
pcm16 = (audio * 32767).astype(np.int16) pcm16 = (audio * 32767).astype(np.int16)
# Encode as base64 # Encode as base64
......
...@@ -13,7 +13,6 @@ import io ...@@ -13,7 +13,6 @@ import io
import time import time
from statistics import mean, median from statistics import mean, median
import librosa
import pytest import pytest
import soundfile import soundfile
import torch import torch
...@@ -21,6 +20,7 @@ from datasets import load_dataset ...@@ -21,6 +20,7 @@ from datasets import load_dataset
from evaluate import load from evaluate import load
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
from vllm.multimodal.audio import get_audio_duration
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
from ....models.registry import HF_EXAMPLE_MODELS from ....models.registry import HF_EXAMPLE_MODELS
...@@ -84,7 +84,7 @@ async def process_dataset(model, client, data, concurrent_request): ...@@ -84,7 +84,7 @@ async def process_dataset(model, client, data, concurrent_request):
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
) )
# Warmup call as the first `librosa.load` server-side is quite slow. # Warmup call as the first `load_audio` server-side is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "") _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")
...@@ -118,7 +118,7 @@ def print_performance_metrics(results, total_time): ...@@ -118,7 +118,7 @@ def print_performance_metrics(results, total_time):
def add_duration(sample): def add_duration(sample):
y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000 sample["duration_ms"] = get_audio_duration(y=y, sr=sr) * 1000
return sample return sample
......
...@@ -5,7 +5,6 @@ import asyncio ...@@ -5,7 +5,6 @@ import asyncio
import json import json
import warnings import warnings
import librosa
import numpy as np import numpy as np
import pybase64 as base64 import pybase64 as base64
import pytest import pytest
...@@ -14,6 +13,7 @@ import websockets ...@@ -14,6 +13,7 @@ import websockets
from tests.entrypoints.openai.conftest import add_attention_backend from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.multimodal.media.audio import load_audio
# Increase engine iteration timeout for ROCm where first-use JIT compilation # Increase engine iteration timeout for ROCm where first-use JIT compilation
# can exceed the default 60s, causing a silent deadlock in feed_tokens. # can exceed the default 60s, causing a silent deadlock in feed_tokens.
...@@ -56,7 +56,7 @@ async def send_event(ws, event: dict) -> None: ...@@ -56,7 +56,7 @@ async def send_event(ws, event: dict) -> None:
def mary_had_lamb_audio_chunks() -> list[str]: def mary_had_lamb_audio_chunks() -> list[str]:
"""Audio split into ~1 second chunks for streaming.""" """Audio split into ~1 second chunks for streaming."""
path = AudioAsset("mary_had_lamb").get_local_path() path = AudioAsset("mary_had_lamb").get_local_path()
audio, _ = librosa.load(str(path), sr=16000, mono=True) audio, _ = load_audio(str(path), sr=16000, mono=True)
# Split into ~0.1 second chunks (1600 samples at 16kHz) # Split into ~0.1 second chunks (1600 samples at 16kHz)
chunk_size = 1600 chunk_size = 1600
......
...@@ -6,7 +6,6 @@ import asyncio ...@@ -6,7 +6,6 @@ import asyncio
import io import io
import json import json
import librosa
import numpy as np import numpy as np
import openai import openai
import pytest import pytest
...@@ -14,6 +13,7 @@ import pytest_asyncio ...@@ -14,6 +13,7 @@ import pytest_asyncio
import soundfile as sf import soundfile as sf
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.multimodal.media.audio import load_audio
from vllm.platforms import current_platform from vllm.platforms import current_platform
MODEL_NAME = "openai/whisper-large-v3-turbo" MODEL_NAME = "openai/whisper-large-v3-turbo"
...@@ -134,7 +134,7 @@ async def test_bad_requests(mary_had_lamb, whisper_client): ...@@ -134,7 +134,7 @@ async def test_bad_requests(mary_had_lamb, whisper_client):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client): async def test_long_audio_request(mary_had_lamb, whisper_client):
mary_had_lamb.seek(0) mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb) audio, sr = load_audio(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process # Add small silence after each audio for repeatability in the split process
audio = np.pad(audio, (0, 1600)) audio = np.pad(audio, (0, 1600))
repeated_audio = np.tile(audio, 10) repeated_audio = np.tile(audio, 10)
......
...@@ -7,7 +7,6 @@ import io ...@@ -7,7 +7,6 @@ import io
import json import json
import httpx import httpx
import librosa
import numpy as np import numpy as np
import openai import openai
import pytest import pytest
...@@ -17,6 +16,7 @@ import soundfile as sf ...@@ -17,6 +16,7 @@ import soundfile as sf
from tests.entrypoints.openai.conftest import add_attention_backend from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.multimodal.media.audio import load_audio
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -264,7 +264,7 @@ async def test_long_audio_request(foscolo, client_and_model): ...@@ -264,7 +264,7 @@ async def test_long_audio_request(foscolo, client_and_model):
if model_name == "google/gemma-3n-E2B-it": if model_name == "google/gemma-3n-E2B-it":
pytest.skip("Gemma3n does not support long audio requests") pytest.skip("Gemma3n does not support long audio requests")
foscolo.seek(0) foscolo.seek(0)
audio, sr = librosa.load(foscolo) audio, sr = load_audio(foscolo)
repeated_audio = np.tile(audio, 2) repeated_audio = np.tile(audio, 2)
# Repeated audio to buffer # Repeated audio to buffer
buffer = io.BytesIO() buffer = io.BytesIO()
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
import os import os
from collections.abc import Sequence from collections.abc import Sequence
import librosa
import pytest import pytest
import regex as re import regex as re
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -14,6 +13,7 @@ from vllm.assets.image import ImageAsset ...@@ -14,6 +13,7 @@ from vllm.assets.image import ImageAsset
from vllm.logprobs import SampleLogprobs from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size from vllm.multimodal.image import convert_image_mode, rescale_image_size
from vllm.multimodal.media.audio import load_audio
from ....conftest import ( from ....conftest import (
IMAGE_ASSETS, IMAGE_ASSETS,
...@@ -290,7 +290,7 @@ def test_vision_speech_models( ...@@ -290,7 +290,7 @@ def test_vision_speech_models(
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
# use the example speech question so that the model outputs are reasonable # use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=None) audio = load_audio(speech_question, sr=None)
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
inputs_vision_speech = [ inputs_vision_speech = [
......
...@@ -4,11 +4,11 @@ ...@@ -4,11 +4,11 @@
from collections.abc import Sequence from collections.abc import Sequence
from typing import Any from typing import Any
import librosa
import pytest import pytest
from transformers import AutoModelForSpeechSeq2Seq from transformers import AutoModelForSpeechSeq2Seq
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.multimodal.audio import AudioResampler
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ....conftest import HfRunner, PromptAudioInput, VllmRunner from ....conftest import HfRunner, PromptAudioInput, VllmRunner
...@@ -93,13 +93,12 @@ def run_test( ...@@ -93,13 +93,12 @@ def run_test(
def resampled_assets() -> list[tuple[Any, int]]: def resampled_assets() -> list[tuple[Any, int]]:
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
sampled_assets = [] sampled_assets = []
resampler = AudioResampler(target_sr=WHISPER_SAMPLE_RATE)
for asset in audio_assets: for asset in audio_assets:
audio, orig_sr = asset.audio_and_sample_rate audio, orig_sr = asset.audio_and_sample_rate
# Resample to Whisper's expected sample rate (16kHz) # Resample to Whisper's expected sample rate (16kHz)
if orig_sr != WHISPER_SAMPLE_RATE: if orig_sr != WHISPER_SAMPLE_RATE:
audio = librosa.resample( audio = resampler.resample(audio, orig_sr=orig_sr)
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
)
sampled_assets.append( sampled_assets.append(
(audio, WHISPER_SAMPLE_RATE), (audio, WHISPER_SAMPLE_RATE),
) )
......
...@@ -3,12 +3,12 @@ ...@@ -3,12 +3,12 @@
from pathlib import Path from pathlib import Path
from unittest.mock import patch from unittest.mock import patch
import librosa
import numpy as np import numpy as np
import pybase64 as base64 import pybase64 as base64
import pytest import pytest
from vllm.multimodal.media import AudioMediaIO from vllm.multimodal.media import AudioMediaIO
from vllm.multimodal.media.audio import load_audio
from ...conftest import AudioTestAssets from ...conftest import AudioTestAssets
...@@ -73,6 +73,6 @@ def test_audio_media_io_from_video(video_assets): ...@@ -73,6 +73,6 @@ def test_audio_media_io_from_video(video_assets):
video_path = video_assets[0].video_path video_path = video_assets[0].video_path
with open(video_path, "rb") as f: with open(video_path, "rb") as f:
audio, sr = audio_io.load_bytes(f.read()) audio, sr = audio_io.load_bytes(f.read())
audio_ref, sr_ref = librosa.load(video_path, sr=None) audio_ref, sr_ref = load_audio(video_path, sr=None)
assert sr == sr_ref assert sr == sr_ref
np.testing.assert_allclose(audio_ref, audio, atol=1e-4) np.testing.assert_allclose(audio_ref, audio, atol=1e-4)
...@@ -29,9 +29,9 @@ except ImportError: ...@@ -29,9 +29,9 @@ except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile # Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`,
# being librosa's main backend. Used to validate if an audio loading error is due to a # soundfile being the main audio loading backend. Used to validate if an audio
# server error vs a client error (invalid audio file). # loading error is due to a server error vs a client error (invalid audio file).
# 0 = sf_error(NULL) race condition: when multiple threads fail sf_open_virtual # 0 = sf_error(NULL) race condition: when multiple threads fail sf_open_virtual
# concurrently, one thread may clear the global error before another reads it, # concurrently, one thread may clear the global error before another reads it,
# producing code=0 ("Garbled error message from libsndfile" in soundfile). # producing code=0 ("Garbled error message from libsndfile" in soundfile).
......
...@@ -4,11 +4,11 @@ import logging ...@@ -4,11 +4,11 @@ import logging
import math import math
import random import random
import librosa
import numpy as np import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch import nn from torch import nn
from torchaudio.functional import melscale_fbanks
from transformers import AutoFeatureExtractor, AutoProcessor, BatchFeature from transformers import AutoFeatureExtractor, AutoProcessor, BatchFeature
from transformers.feature_extraction_sequence_utils import ( from transformers.feature_extraction_sequence_utils import (
SequenceFeatureExtractor, SequenceFeatureExtractor,
...@@ -129,17 +129,15 @@ class FilterbankFeatures(nn.Module): ...@@ -129,17 +129,15 @@ class FilterbankFeatures(nn.Module):
self.pad_min_duration = 0.0 self.pad_min_duration = 0.0
self.pad_direction = "both" self.pad_direction = "both"
filterbanks = torch.tensor( filterbanks = melscale_fbanks(
librosa.filters.mel( n_freqs=self.n_fft // 2 + 1,
sr=sample_rate, f_min=lowfreq,
n_fft=self.n_fft, f_max=highfreq,
n_mels=nfilt, n_mels=nfilt,
fmin=lowfreq, sample_rate=sample_rate,
fmax=highfreq, norm=mel_norm,
norm=mel_norm, mel_scale="slaney",
), ).T.unsqueeze(0)
dtype=torch.float,
).unsqueeze(0)
self.register_buffer("fb", filterbanks) self.register_buffer("fb", filterbanks)
# Calculate maximum sequence length # Calculate maximum sequence length
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment