Unverified Commit 36d450e3 authored by ramos's avatar ramos Committed by GitHub
Browse files

Adds FunAudioChat multimodal audio model support (#2) (#33058)


Signed-off-by: ramos <49182011+nemoramo@users.noreply.github.com>
Signed-off-by: mayufeng <mayufeng@example.com>
Co-authored-by: mayufeng <mayufeng@example.com>
parent a2b877df
......@@ -117,6 +117,31 @@ def run_glmasr(question: str, audio_count: int) -> ModelRequestData:
)
# FunAudioChat
def run_funaudiochat(question: str, audio_count: int) -> ModelRequestData:
    """Build the request data (engine args + prompt) for FunAudioChat.

    The prompt is `audio_count` audio placeholder lines followed by
    `question`.

    Args:
        question: Text appended after the audio placeholders.
        audio_count: Number of audio placeholders to emit; also used as the
            per-prompt audio limit.

    Returns:
        A `ModelRequestData` holding the engine arguments and the prompt.
    """
    # NOTE: FunAudioChat is not available on the HuggingFace Hub at the time of
    # writing. Pass a local model path via `--model`.
    audio_placeholder = "<|audio_bos|><|AUDIO|><|audio_eos|>\n"

    return ModelRequestData(
        engine_args=EngineArgs(
            model="funaudiochat",
            max_model_len=4096,
            max_num_seqs=2,
            limit_mm_per_prompt={"audio": audio_count},
            enforce_eager=True,
        ),
        # One placeholder line per audio item, then the question text.
        prompt=f"{audio_placeholder * audio_count}{question}",
    )
# Granite Speech
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# NOTE - the settings in this example are somewhat different from what is
......@@ -410,6 +435,7 @@ model_example_map = {
"audioflamingo3": run_audioflamingo3,
"gemma3n": run_gemma3n,
"glmasr": run_glmasr,
"funaudiochat": run_funaudiochat,
"granite_speech": run_granite_speech,
"midashenglm": run_midashenglm,
"minicpmo": run_minicpmo,
......@@ -435,6 +461,12 @@ def parse_args():
choices=model_example_map.keys(),
help='Huggingface "model_type".',
)
parser.add_argument(
"--model",
type=str,
default=None,
help="Model ID or local path override. Required for funaudiochat.",
)
parser.add_argument(
"--num-prompts", type=int, default=1, help="Number of prompts to run."
)
......@@ -467,6 +499,9 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if model == "funaudiochat" and not args.model:
raise ValueError("--model is required when --model-type=funaudiochat")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
......@@ -477,6 +512,8 @@ def main(args):
req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count
)
if model == "funaudiochat":
req_data.engine_args.model = args.model
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
......
......@@ -692,6 +692,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
trust_remote_code=True,
),
"FunAudioChatForConditionalGeneration": _HfExamplesInfo(
"funaudiochat", is_available_online=False
),
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
"Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"),
......
This diff is collapsed.
......@@ -312,6 +312,10 @@ _MULTIMODAL_MODELS = {
"ernie45_vl",
"Ernie4_5_VLMoeForConditionalGeneration",
),
"FunAudioChatForConditionalGeneration": (
"funaudiochat",
"FunAudioChatForConditionalGeneration",
),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": (
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from dataclasses import dataclass
from enum import Enum
from typing import Literal
......@@ -195,6 +196,13 @@ class AudioResampler:
raise RuntimeError(
"Audio resampling is not supported when `target_sr` is not provided"
)
if math.isclose(
float(orig_sr),
float(self.target_sr),
rel_tol=0.0,
abs_tol=1e-6,
):
return audio
if self.method == "librosa":
return resample_audio_librosa(
audio, orig_sr=orig_sr, target_sr=self.target_sr
......
......@@ -77,6 +77,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32="DeepseekV3Config",
flex_olmo="FlexOlmoConfig",
funaudiochat="FunAudioChatConfig",
hunyuan_vl="HunYuanVLConfig",
isaac="IsaacConfig",
kimi_linear="KimiLinearConfig",
......
......@@ -22,6 +22,8 @@ _CLASS_TO_MODULE: dict[str, str] = {
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
"EAGLEConfig": "vllm.transformers_utils.configs.eagle",
"FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
"FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
"FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
"HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
......@@ -65,6 +67,8 @@ __all__ = [
"DotsOCRConfig",
"EAGLEConfig",
"FlexOlmoConfig",
"FunAudioChatConfig",
"FunAudioChatAudioEncoderConfig",
"HunYuanVLConfig",
"HunYuanVLTextConfig",
"HunYuanVLVisionConfig",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
from transformers import PretrainedConfig
# NOTE: Temporary shim for FunAudioChat checkpoints.
# These checkpoints use `model_type="funaudiochat"`, which is not currently
# recognized by released Transformers, and the public checkpoint does not
# provide an `auto_map` to enable `trust_remote_code=True`.
# Remove this file once Transformers adds native support (or the checkpoint
# provides an `auto_map`) and vLLM can rely on `AutoConfig.from_pretrained()`.
class FunAudioChatAudioEncoderConfig(PretrainedConfig):
    """Configuration holder for the FunAudioChat audio encoder.

    Every named constructor argument is stored verbatim as an attribute of
    the same name. In addition, `num_hidden_layers` is set to the value of
    `encoder_layers`, and `_attn_implementation` falls back to `"sdpa"` when
    it is not provided (or is falsy). Any remaining keyword arguments are
    forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "funaudiochat_audio_encoder"

    def __init__(
        self,
        _attn_implementation: str | None = None,
        num_mel_bins: int = 128,
        encoder_layers: int = 32,
        encoder_attention_heads: int = 20,
        encoder_ffn_dim: int = 5120,
        d_model: int = 1280,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        activation_function: str = "gelu",
        activation_dropout: float = 0.0,
        scale_embedding: bool = False,
        initializer_range: float = 0.02,
        max_source_positions: int = 1500,
        n_window: int = 100,
        output_dim: int = 3584,
        bos_token_id: int | None = None,
        codebook_size: int | None = None,
        continuous_features_mode: str = "replace",
        crq_transformer_config: dict | None = None,
        eos_token_id: int | None = None,
        group_size: int = 5,
        enable_audio_invert_tower: bool = True,
        pad_token_id: int | None = None,
        **kwargs,
    ) -> None:
        # `_attn_implementation` is a named parameter, so Python binds it
        # there and it can never appear in `kwargs`; popping it from `kwargs`
        # would always yield the default.
        super().__init__(**kwargs)

        # Match HF default for attention implementation selection.
        self._attn_implementation = _attn_implementation or "sdpa"

        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_function = activation_function
        self.activation_dropout = activation_dropout
        # Alias of `encoder_layers` under the generic layer-count name.
        self.num_hidden_layers = encoder_layers
        self.initializer_range = initializer_range
        self.scale_embedding = scale_embedding
        self.max_source_positions = max_source_positions
        self.n_window = n_window
        self.output_dim = output_dim

        # Assigned after `super().__init__()` so these explicit values are
        # what ends up stored on the instance.
        self.bos_token_id = bos_token_id
        self.codebook_size = codebook_size
        self.continuous_features_mode = continuous_features_mode
        self.crq_transformer_config = crq_transformer_config
        self.eos_token_id = eos_token_id
        self.group_size = group_size
        self.enable_audio_invert_tower = enable_audio_invert_tower
        self.pad_token_id = pad_token_id
class FunAudioChatConfig(PretrainedConfig):
    """Top-level configuration for FunAudioChat checkpoints.

    Combines an audio encoder config (`audio_config`) with a text model
    config (`text_config`).

    Args:
        audio_config: A ready config object, a dict of keyword arguments for
            `FunAudioChatAudioEncoderConfig`, or `None` for the defaults.
        text_config: A ready config object, a dict of keyword arguments for
            the config class registered in `transformers.CONFIG_MAPPING`
            under its `"model_type"` (default `"qwen2"`), or `None` for a
            default `qwen2` config.
        audio_token_index: Stored as-is; also reachable as `audio_token_id`
            through `attribute_map`.
        ignore_index: Stored as-is.
        hidden_size: Stored as an `int`; when `None`, taken from
            `text_config.hidden_size`.
        **kwargs: Forwarded to `PretrainedConfig.__init__`.

    Raises:
        KeyError: If a dict `text_config` names a `"model_type"` that is not
            present in `transformers.CONFIG_MAPPING`.
    """

    model_type = "funaudiochat"
    attribute_map = {
        "audio_token_id": "audio_token_index",
    }

    def __init__(
        self,
        audio_config: PretrainedConfig | dict | None = None,
        text_config: PretrainedConfig | dict | None = None,
        audio_token_index: int = 151646,
        ignore_index: int = -100,
        hidden_size: int | None = None,
        **kwargs,
    ) -> None:
        self.audio_token_index = audio_token_index
        self.ignore_index = ignore_index

        if isinstance(audio_config, dict):
            # Work on a shallow copy so the caller's dict is not modified by
            # the `setdefault` below.
            audio_kwargs = dict(audio_config)
            audio_kwargs.setdefault(
                "model_type", FunAudioChatAudioEncoderConfig.model_type
            )
            audio_config = FunAudioChatAudioEncoderConfig(**audio_kwargs)
        elif audio_config is None:
            audio_config = FunAudioChatAudioEncoderConfig()
        self.audio_config = audio_config

        if text_config is None or isinstance(text_config, dict):
            # Local import kept from the original implementation.
            import transformers

            if text_config is None:
                text_config = transformers.CONFIG_MAPPING["qwen2"]()
            else:
                # Shallow copy for the same reason as `audio_config` above.
                text_kwargs = dict(text_config)
                # Default to qwen2 for backwards compatibility; FunAudioChat
                # uses qwen3 in practice for recent checkpoints.
                text_kwargs.setdefault("model_type", "qwen2")
                text_cls = transformers.CONFIG_MAPPING[text_kwargs["model_type"]]
                text_config = text_cls(**text_kwargs)
        self.text_config = text_config

        # An explicit `hidden_size` wins; otherwise mirror the text model's.
        self.hidden_size = (
            int(self.text_config.hidden_size)
            if hidden_size is None
            else int(hidden_size)
        )

        super().__init__(**kwargs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment