Unverified Commit 36d450e3 authored by ramos's avatar ramos Committed by GitHub
Browse files

Adds FunAudioChat multimodal audio model support (#2) (#33058)


Signed-off-by: default avatarramos <49182011+nemoramo@users.noreply.github.com>
Signed-off-by: default avatarmayufeng <mayufeng@example.com>
Co-authored-by: default avatarmayufeng <mayufeng@example.com>
parent a2b877df
...@@ -117,6 +117,31 @@ def run_glmasr(question: str, audio_count: int) -> ModelRequestData: ...@@ -117,6 +117,31 @@ def run_glmasr(question: str, audio_count: int) -> ModelRequestData:
) )
# FunAudioChat
def run_funaudiochat(question: str, audio_count: int) -> ModelRequestData:
# NOTE: FunAudioChat is not available on the HuggingFace Hub at the time of
# writing. Pass a local model path via `--model`.
model_name = "funaudiochat"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
enforce_eager=True,
)
audio_in_prompt = "".join(
["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for _ in range(audio_count)]
)
prompt = f"{audio_in_prompt}{question}"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# Granite Speech # Granite Speech
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# NOTE - the setting in this example are somewhat different from what is # NOTE - the setting in this example are somewhat different from what is
...@@ -410,6 +435,7 @@ model_example_map = { ...@@ -410,6 +435,7 @@ model_example_map = {
"audioflamingo3": run_audioflamingo3, "audioflamingo3": run_audioflamingo3,
"gemma3n": run_gemma3n, "gemma3n": run_gemma3n,
"glmasr": run_glmasr, "glmasr": run_glmasr,
"funaudiochat": run_funaudiochat,
"granite_speech": run_granite_speech, "granite_speech": run_granite_speech,
"midashenglm": run_midashenglm, "midashenglm": run_midashenglm,
"minicpmo": run_minicpmo, "minicpmo": run_minicpmo,
...@@ -435,6 +461,12 @@ def parse_args(): ...@@ -435,6 +461,12 @@ def parse_args():
choices=model_example_map.keys(), choices=model_example_map.keys(),
help='Huggingface "model_type".', help='Huggingface "model_type".',
) )
parser.add_argument(
"--model",
type=str,
default=None,
help="Model ID or local path override. Required for funaudiochat.",
)
parser.add_argument( parser.add_argument(
"--num-prompts", type=int, default=1, help="Number of prompts to run." "--num-prompts", type=int, default=1, help="Number of prompts to run."
) )
...@@ -467,6 +499,9 @@ def main(args): ...@@ -467,6 +499,9 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if model == "funaudiochat" and not args.model:
raise ValueError("--model is required when --model-type=funaudiochat")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1: if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError( raise ValueError(
f"tensor_parallel_size must be a positive integer, " f"tensor_parallel_size must be a positive integer, "
...@@ -477,6 +512,8 @@ def main(args): ...@@ -477,6 +512,8 @@ def main(args):
req_data = model_example_map[model]( req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count question_per_audio_count[audio_count], audio_count
) )
if model == "funaudiochat":
req_data.engine_args.model = args.model
# Disable other modalities to save memory # Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0} default_limits = {"image": 0, "video": 0, "audio": 0}
......
...@@ -692,6 +692,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -692,6 +692,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"baidu/ERNIE-4.5-VL-28B-A3B-PT", "baidu/ERNIE-4.5-VL-28B-A3B-PT",
trust_remote_code=True, trust_remote_code=True,
), ),
"FunAudioChatForConditionalGeneration": _HfExamplesInfo(
"funaudiochat", is_available_online=False
),
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
"Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"), "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"),
......
This diff is collapsed.
...@@ -312,6 +312,10 @@ _MULTIMODAL_MODELS = { ...@@ -312,6 +312,10 @@ _MULTIMODAL_MODELS = {
"ernie45_vl", "ernie45_vl",
"Ernie4_5_VLMoeForConditionalGeneration", "Ernie4_5_VLMoeForConditionalGeneration",
), ),
"FunAudioChatForConditionalGeneration": (
"funaudiochat",
"FunAudioChatForConditionalGeneration",
),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": ( "Gemma3nForConditionalGeneration": (
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Literal from typing import Literal
...@@ -195,6 +196,13 @@ class AudioResampler: ...@@ -195,6 +196,13 @@ class AudioResampler:
raise RuntimeError( raise RuntimeError(
"Audio resampling is not supported when `target_sr` is not provided" "Audio resampling is not supported when `target_sr` is not provided"
) )
if math.isclose(
float(orig_sr),
float(self.target_sr),
rel_tol=0.0,
abs_tol=1e-6,
):
return audio
if self.method == "librosa": if self.method == "librosa":
return resample_audio_librosa( return resample_audio_librosa(
audio, orig_sr=orig_sr, target_sr=self.target_sr audio, orig_sr=orig_sr, target_sr=self.target_sr
......
...@@ -77,6 +77,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( ...@@ -77,6 +77,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
deepseek_vl_v2="DeepseekVLV2Config", deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32="DeepseekV3Config", deepseek_v32="DeepseekV3Config",
flex_olmo="FlexOlmoConfig", flex_olmo="FlexOlmoConfig",
funaudiochat="FunAudioChatConfig",
hunyuan_vl="HunYuanVLConfig", hunyuan_vl="HunYuanVLConfig",
isaac="IsaacConfig", isaac="IsaacConfig",
kimi_linear="KimiLinearConfig", kimi_linear="KimiLinearConfig",
......
...@@ -22,6 +22,8 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -22,6 +22,8 @@ _CLASS_TO_MODULE: dict[str, str] = {
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr", "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
"EAGLEConfig": "vllm.transformers_utils.configs.eagle", "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
"FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo", "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
"FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
"FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
"HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl", "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl", "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl", "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
...@@ -65,6 +67,8 @@ __all__ = [ ...@@ -65,6 +67,8 @@ __all__ = [
"DotsOCRConfig", "DotsOCRConfig",
"EAGLEConfig", "EAGLEConfig",
"FlexOlmoConfig", "FlexOlmoConfig",
"FunAudioChatConfig",
"FunAudioChatAudioEncoderConfig",
"HunYuanVLConfig", "HunYuanVLConfig",
"HunYuanVLTextConfig", "HunYuanVLTextConfig",
"HunYuanVLVisionConfig", "HunYuanVLVisionConfig",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
from transformers import PretrainedConfig
# NOTE: Temporary shim for FunAudioChat checkpoints.
# These checkpoints use `model_type="funaudiochat"`, which is not currently
# recognized by released Transformers, and the public checkpoint does not
# provide an `auto_map` to enable `trust_remote_code=True`.
# Remove this file once Transformers adds native support (or the checkpoint
# provides an `auto_map`) and vLLM can rely on `AutoConfig.from_pretrained()`.
class FunAudioChatAudioEncoderConfig(PretrainedConfig):
model_type = "funaudiochat_audio_encoder"
def __init__(
self,
_attn_implementation: str | None = None,
num_mel_bins: int = 128,
encoder_layers: int = 32,
encoder_attention_heads: int = 20,
encoder_ffn_dim: int = 5120,
d_model: int = 1280,
dropout: float = 0.0,
attention_dropout: float = 0.0,
activation_function: str = "gelu",
activation_dropout: float = 0.0,
scale_embedding: bool = False,
initializer_range: float = 0.02,
max_source_positions: int = 1500,
n_window: int = 100,
output_dim: int = 3584,
bos_token_id: int | None = None,
codebook_size: int | None = None,
continuous_features_mode: str = "replace",
crq_transformer_config: dict | None = None,
eos_token_id: int | None = None,
group_size: int = 5,
enable_audio_invert_tower: bool = True,
pad_token_id: int | None = None,
**kwargs,
) -> None:
attn_impl = kwargs.pop("_attn_implementation", None) or _attn_implementation
super().__init__(**kwargs)
# Match HF default for attention implementation selection.
self._attn_implementation = attn_impl or "sdpa"
self.num_mel_bins = num_mel_bins
self.d_model = d_model
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_function = activation_function
self.activation_dropout = activation_dropout
self.num_hidden_layers = encoder_layers
self.initializer_range = initializer_range
self.scale_embedding = scale_embedding
self.max_source_positions = max_source_positions
self.n_window = n_window
self.output_dim = output_dim
self.bos_token_id = bos_token_id
self.codebook_size = codebook_size
self.continuous_features_mode = continuous_features_mode
self.crq_transformer_config = crq_transformer_config
self.eos_token_id = eos_token_id
self.group_size = group_size
self.enable_audio_invert_tower = enable_audio_invert_tower
self.pad_token_id = pad_token_id
class FunAudioChatConfig(PretrainedConfig):
model_type = "funaudiochat"
attribute_map = {
"audio_token_id": "audio_token_index",
}
def __init__(
self,
audio_config: PretrainedConfig | dict | None = None,
text_config: PretrainedConfig | dict | None = None,
audio_token_index: int = 151646,
ignore_index: int = -100,
hidden_size: int | None = None,
**kwargs,
) -> None:
self.audio_token_index = audio_token_index
self.ignore_index = ignore_index
if isinstance(audio_config, dict):
audio_config.setdefault(
"model_type", FunAudioChatAudioEncoderConfig.model_type
)
audio_config = FunAudioChatAudioEncoderConfig(**audio_config)
elif audio_config is None:
audio_config = FunAudioChatAudioEncoderConfig()
self.audio_config = audio_config
if isinstance(text_config, dict):
# Default to qwen2 for backwards compatibility; FunAudioChat uses
# qwen3 in practice for recent checkpoints.
text_config.setdefault("model_type", "qwen2")
import transformers
text_cls = transformers.CONFIG_MAPPING[text_config["model_type"]]
text_config = text_cls(**text_config)
elif text_config is None:
import transformers
text_config = transformers.CONFIG_MAPPING["qwen2"]()
self.text_config = text_config
self.hidden_size = (
int(self.text_config.hidden_size)
if hidden_size is None
else int(hidden_size)
)
super().__init__(**kwargs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment