Add AudioFlamingo3 model support (#30539)

Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com> Signed-off-by: Lasha Koroshinadze <26011196+lashahub@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

Add AudioFlamingo3 model support (#30539)
Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com> Signed-off-by: Lasha Koroshinadze <26011196+lashahub@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
3a20450d · Lasha Koroshinadze · GitHub · 1a55cfaf · 3a20450d · 3a20450d
Unverified Commit 3a20450d authored Dec 14, 2025 by Lasha Koroshinadze Committed by GitHub Dec 14, 2025
9 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -659,6 +659,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|--------|-------------------|----------------------|---------------------------|
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
+| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |

--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -42,60 +42,31 @@ class ModelRequestData(NamedTuple):
 # Unless specified, these settings have been tested to work on a single L4.
-# Voxtral
+# AudioFlamingo3
-# Make sure to install mistral-common[audio].
+def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
-def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
+    model_name = "nvidia/audio-flamingo-3-hf"
-    from mistral_common.audio import Audio
-    from mistral_common.protocol.instruct.chunk import (
-        AudioChunk,
-        RawAudio,
-        TextChunk,
-    )
-    from mistral_common.protocol.instruct.messages import (
-        UserMessage,
-    )
-    from mistral_common.protocol.instruct.request import ChatCompletionRequest
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    model_name = "mistralai/Voxtral-Mini-3B-2507"
-    tokenizer = MistralTokenizer.from_hf_hub(model_name)
    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=8192,
+        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
-        config_format="mistral",
-        load_format="mistral",
-        tokenizer_mode="mistral",
        enforce_eager=True,
-        enable_chunked_prefill=False,
    )
-    text_chunk = TextChunk(text=question)
+    # AudioFlamingo3 uses <sound> token for audio
-    audios = [
+    audio_placeholder = "<sound>" * audio_count
-        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
-        for i in range(audio_count)
-    ]
-    audio_chunks = [
-        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
-    ]
-    messages = [UserMessage(content=[*audio_chunks, text_chunk])]
-    req = ChatCompletionRequest(messages=messages, model=model_name)
-    tokens = tokenizer.encode_chat_completion(req)
-    prompt_ids, audios = tokens.tokens, tokens.audios
-    audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
-    multi_modal_data = {"audio": audios_and_sr}
+    prompt = (
+        "<|im_start|>system\n"
+        "You are a helpful assistant.<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_placeholder}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
    return ModelRequestData(
        engine_args=engine_args,
-        prompt_token_ids=prompt_ids,
+        prompt=prompt,
-        multi_modal_data=multi_modal_data,
    )
@@ -361,6 +332,63 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
    )
+# Voxtral
+# Make sure to install mistral-common[audio].
+def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
+    from mistral_common.audio import Audio
+    from mistral_common.protocol.instruct.chunk import (
+        AudioChunk,
+        RawAudio,
+        TextChunk,
+    )
+    from mistral_common.protocol.instruct.messages import (
+        UserMessage,
+    )
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+    model_name = "mistralai/Voxtral-Mini-3B-2507"
+    tokenizer = MistralTokenizer.from_hf_hub(model_name)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"audio": audio_count},
+        config_format="mistral",
+        load_format="mistral",
+        tokenizer_mode="mistral",
+        enforce_eager=True,
+        enable_chunked_prefill=False,
+    )
+    text_chunk = TextChunk(text=question)
+    audios = [
+        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
+        for i in range(audio_count)
+    ]
+    audio_chunks = [
+        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
+    ]
+    messages = [UserMessage(content=[*audio_chunks, text_chunk])]
+    req = ChatCompletionRequest(messages=messages, model=model_name)
+    tokens = tokenizer.encode_chat_completion(req)
+    prompt_ids, audios = tokens.tokens, tokens.audios
+    audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
+    multi_modal_data = {"audio": audios_and_sr}
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt_token_ids=prompt_ids,
+        multi_modal_data=multi_modal_data,
+    )
 # Whisper
 def run_whisper(question: str, audio_count: int) -> ModelRequestData:
    assert audio_count == 1, "Whisper only support single audio input per prompt"
@@ -382,7 +410,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
 model_example_map = {
-    "voxtral": run_voxtral,
+    "audioflamingo3": run_audioflamingo3,
    "gemma3n": run_gemma3n,
    "granite_speech": run_granite_speech,
    "midashenglm": run_midashenglm,
@@ -392,6 +420,7 @@ model_example_map = {
    "qwen2_audio": run_qwen2_audio,
    "qwen2_5_omni": run_qwen2_5_omni,
    "ultravox": run_ultravox,
+    "voxtral": run_voxtral,
    "whisper": run_whisper,
 }

--- a/tests/models/fixtures/audioflamingo3/expected_results_batched.json
+++ b/tests/models/fixtures/audioflamingo3/expected_results_batched.json
+{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other.", "(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world"], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645], [5349, 8, 2014, 13216, 429, 4128, 4157, 3158, 9355, 11, 7578, 404, 4849, 279, 46488, 315, 3691, 323, 4158, 304, 279, 1879, 151645, 151671]]}
\ No newline at end of file
--- a/tests/models/fixtures/audioflamingo3/expected_results_single.json
+++ b/tests/models/fixtures/audioflamingo3/expected_results_single.json
+{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}
\ No newline at end of file
--- a/tests/models/multimodal/generation/test_audioflamingo3.py
+++ b/tests/models/multimodal/generation/test_audioflamingo3.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The vLLM team.
+# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
+# reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import pytest
+from tests.models.registry import HF_EXAMPLE_MODELS
+from vllm import LLM, SamplingParams
+MODEL_NAME = "nvidia/audio-flamingo-3-hf"
+def get_fixture_path(filename):
+    return os.path.join(
+        os.path.dirname(__file__), "../../fixtures/audioflamingo3", filename
+    )
+@pytest.fixture(scope="module")
+def llm():
+    # Check if the model is supported by the current transformers version
+    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
+    model_info.check_transformers_version(on_fail="skip")
+    try:
+        llm = LLM(
+            model=MODEL_NAME,
+            trust_remote_code=True,
+            dtype="bfloat16",
+            enforce_eager=True,
+            limit_mm_per_prompt={"audio": 1},
+        )
+        return llm
+    except Exception as e:
+        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
+def test_single_generation(llm):
+    fixture_path = get_fixture_path("expected_results_single.json")
+    if not os.path.exists(fixture_path):
+        pytest.skip(f"Fixture not found: {fixture_path}")
+    with open(fixture_path) as f:
+        expected = json.load(f)
+    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio_url", "audio_url": {"url": audio_url}},
+                {"type": "text", "text": "Transcribe the input speech."},
+            ],
+        }
+    ]
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
+    outputs = llm.chat(
+        messages=messages,
+        sampling_params=sampling_params,
+    )
+    generated_text = outputs[0].outputs[0].text.strip()
+    expected_text = expected["transcriptions"][0]
+    assert expected_text in generated_text or generated_text in expected_text
+def test_batched_generation(llm):
+    fixture_path = get_fixture_path("expected_results_batched.json")
+    if not os.path.exists(fixture_path):
+        pytest.skip(f"Fixture not found: {fixture_path}")
+    with open(fixture_path) as f:
+        expected = json.load(f)
+    items = [
+        {
+            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
+            "question": "What is surprising about the relationship "
+            "between the barking and the music?",
+            "expected_idx": 0,
+        },
+        {
+            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
+            "question": (
+                "Why is the philosopher's name mentioned in the lyrics? "
+                "(A) To express a sense of nostalgia "
+                "(B) To indicate that language cannot express clearly, "
+                "satirizing the inversion of black and white in the world "
+                "(C) To add depth and complexity to the lyrics "
+                "(D) To showcase the wisdom and influence of the philosopher"
+            ),
+            "expected_idx": 1,
+        },
+    ]
+    conversations = []
+    for item in items:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
+                    {"type": "text", "text": item["question"]},
+                ],
+            }
+        ]
+        conversations.append(messages)
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
+    outputs = llm.chat(
+        messages=conversations,
+        sampling_params=sampling_params,
+    )
+    for i, output in enumerate(outputs):
+        generated_text = output.outputs[0].text.strip()
+        expected_text = expected["transcriptions"][i]
+        assert expected_text in generated_text or generated_text in expected_text
--- a/tests/models/multimodal/processing/test_audioflamingo3.py
+++ b/tests/models/multimodal/processing/test_audioflamingo3.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The vLLM team.
+# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
+# reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from unittest.mock import MagicMock
+import numpy as np
+import pytest
+import torch
+from transformers import PretrainedConfig
+from tests.models.registry import HF_EXAMPLE_MODELS
+class MockAudioFlamingo3Config(PretrainedConfig):
+    model_type = "audioflamingo3"
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.audio_config = PretrainedConfig()
+        self.text_config = PretrainedConfig()
+class MockAudioFlamingo3Processor:
+    def __init__(self):
+        self.audio_token = "<sound>"
+        self.audio_token_id = 12345
+        self.feature_extractor = MockFeatureExtractor()
+    def __call__(self, text=None, audios=None, **kwargs):
+        return {"input_ids": [1, 2, 3], "input_features": [np.zeros((3000, 80))]}
+class MockFeatureExtractor:
+    def __init__(self):
+        self.sampling_rate = 16000
+        self.chunk_length = 30
+@pytest.fixture
+def mock_ctx():
+    config = MockAudioFlamingo3Config()
+    ctx = MagicMock()
+    ctx.get_hf_config.return_value = config
+    ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
+    ctx.model_config.hf_config = config
+    return ctx
+@pytest.fixture(autouse=True)
+def check_transformers_version():
+    # Check if the model is supported by the current transformers version
+    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
+    model_info.check_transformers_version(on_fail="skip")
+def test_audio_chunk_counting(mock_ctx):
+    from vllm.model_executor.models.audioflamingo3 import (
+        AudioFlamingo3DummyInputsBuilder,
+        AudioFlamingo3MultiModalProcessor,
+        AudioFlamingo3ProcessingInfo,
+    )
+    info = AudioFlamingo3ProcessingInfo(mock_ctx)
+    processor = AudioFlamingo3MultiModalProcessor(
+        info, AudioFlamingo3DummyInputsBuilder(info)
+    )
+    sr = 16000
+    audio_1 = np.zeros(30 * sr)
+    audio_2 = np.zeros(45 * sr)
+    mm_data = {"audio": [audio_1, audio_2]}
+    prompt = "<|user|>Listen.<|end|>"
+    from vllm.multimodal.processing import BaseMultiModalProcessor
+    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
+        return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}
+    with pytest.MonkeyPatch.context() as mp:
+        mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
+        processed = processor._call_hf_processor(prompt, mm_data, {}, {})
+        chunk_counts = processed["chunk_counts"]
+        assert chunk_counts[0].item() == 1
+        assert chunk_counts[1].item() == 2
+        assert len(chunk_counts) == 2
+def test_dummy_data_generation(mock_ctx):
+    from vllm.model_executor.models.audioflamingo3 import (
+        AudioFlamingo3DummyInputsBuilder,
+        AudioFlamingo3ProcessingInfo,
+    )
+    info = AudioFlamingo3ProcessingInfo(mock_ctx)
+    builder = AudioFlamingo3DummyInputsBuilder(info)
+    mm_counts = {"audio": 2}
+    dummy_data = builder.get_dummy_mm_data(100, mm_counts, None)
+    assert "audio" in dummy_data
+    assert len(dummy_data["audio"]) == 2
+    expected_len = 600 * 16000
+    assert len(dummy_data["audio"][0]) == expected_len
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -578,6 +578,9 @@ _AUTOMATIC_CONVERTED_MODELS = {
 _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
+    "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
+        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
+    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
    "BeeForConditionalGeneration": _HfExamplesInfo(
        "Open-Bee/Bee-8B-RL",

--- a/vllm/model_executor/models/audioflamingo3.py
+++ b/vllm/model_executor/models/audioflamingo3.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -264,6 +264,10 @@ _CROSS_ENCODER_MODELS = {
 _MULTIMODAL_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
+    "AudioFlamingo3ForConditionalGeneration": (
+        "audioflamingo3",
+        "AudioFlamingo3ForConditionalGeneration",
+    ),
    "AyaVisionForConditionalGeneration": (
        "aya_vision",
        "AyaVisionForConditionalGeneration",