Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c46b0cd0
Unverified
Commit
c46b0cd0
authored
Jan 30, 2026
by
Wang Haoyu
Committed by
GitHub
Jan 30, 2026
Browse files
[Model][Multimodal] Add explicit MusicFlamingo adapter (#32696)
Signed-off-by:
WangHaoyuuu
<
mailwhaoyu@gmail.com
>
parent
13376576
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
115 additions
and
2 deletions
+115
-2
docs/models/supported_models.md
docs/models/supported_models.md
+1
-1
examples/offline_inference/audio_language.py
examples/offline_inference/audio_language.py
+29
-0
tests/models/registry.py
tests/models/registry.py
+3
-0
vllm/model_executor/models/audioflamingo3.py
vllm/model_executor/models/audioflamingo3.py
+8
-1
vllm/model_executor/models/musicflamingo.py
vllm/model_executor/models/musicflamingo.py
+70
-0
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+4
-0
No files found.
docs/models/supported_models.md
View file @
c46b0cd0
...
@@ -657,7 +657,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
...
@@ -657,7 +657,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| Architecture | Models | Inputs | Example HF Models |
[
LoRA
](
../features/lora.md
)
|
[
PP
](
../serving/parallelism_scaling.md
)
|
| Architecture | Models | Inputs | Example HF Models |
[
LoRA
](
../features/lora.md
)
|
[
PP
](
../serving/parallelism_scaling.md
)
|
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
`AriaForConditionalGeneration`
| Aria | T + I
<sup>
+
</sup>
|
`rhymes-ai/Aria`
| | |
|
`AriaForConditionalGeneration`
| Aria | T + I
<sup>
+
</sup>
|
`rhymes-ai/Aria`
| | |
|
`AudioFlamingo3ForConditionalGeneration`
| AudioFlamingo3 | T + A
<sup>
+
</sup>
|
`nvidia/audio-flamingo-3-hf`
,
`nvidia/music-flamingo-hf`
| ✅︎ | ✅︎ |
|
`AudioFlamingo3ForConditionalGeneration`
| AudioFlamingo3 | T + A
<sup>
+
</sup>
|
`nvidia/audio-flamingo-3-hf`
,
`nvidia/music-flamingo-
2601-
hf`
| ✅︎ | ✅︎ |
|
`AyaVisionForConditionalGeneration`
| Aya Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/aya-vision-8b`
,
`CohereLabs/aya-vision-32b`
, etc. | | ✅︎ |
|
`AyaVisionForConditionalGeneration`
| Aya Vision | T + I
<sup>
+
</sup>
|
`CohereLabs/aya-vision-8b`
,
`CohereLabs/aya-vision-32b`
, etc. | | ✅︎ |
|
`BagelForConditionalGeneration`
| BAGEL | T + I
<sup>
+
</sup>
|
`ByteDance-Seed/BAGEL-7B-MoT`
| ✅︎ | ✅︎ |
|
`BagelForConditionalGeneration`
| BAGEL | T + I
<sup>
+
</sup>
|
`ByteDance-Seed/BAGEL-7B-MoT`
| ✅︎ | ✅︎ |
|
`BeeForConditionalGeneration`
| Bee-8B | T + I
<sup>
E+
</sup>
|
`Open-Bee/Bee-8B-RL`
,
`Open-Bee/Bee-8B-SFT`
| | ✅︎ |
|
`BeeForConditionalGeneration`
| Bee-8B | T + I
<sup>
E+
</sup>
|
`Open-Bee/Bee-8B-RL`
,
`Open-Bee/Bee-8B-SFT`
| | ✅︎ |
...
...
examples/offline_inference/audio_language.py
View file @
c46b0cd0
...
@@ -70,6 +70,34 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
...
@@ -70,6 +70,34 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
)
)
# MusicFlamingo
def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
    """Build the engine arguments and chat prompt for the MusicFlamingo example.

    Args:
        question: Text placed in the user turn, directly after the audio
            placeholders.
        audio_count: Number of audio clips. It is used both as the per-prompt
            "audio" limit and as the number of ``<sound>`` placeholders.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompt.
    """
    # MusicFlamingo marks each audio clip with a <sound> token.
    sound_tokens = "<sound>" * audio_count

    # Assemble the chat template turn by turn; the pieces concatenate to the
    # exact prompt string the model expects.
    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{sound_tokens}{question}<|im_end|>\n"
    assistant_header = "<|im_start|>assistant\n"
    prompt = system_turn + user_turn + assistant_header

    return ModelRequestData(
        engine_args=EngineArgs(
            model="nvidia/music-flamingo-2601-hf",
            max_model_len=4096,
            max_num_seqs=2,
            limit_mm_per_prompt={"audio": audio_count},
            enforce_eager=True,
        ),
        prompt=prompt,
    )
# Gemma3N
# Gemma3N
def
run_gemma3n
(
question
:
str
,
audio_count
:
int
)
->
ModelRequestData
:
def
run_gemma3n
(
question
:
str
,
audio_count
:
int
)
->
ModelRequestData
:
model_name
=
"google/gemma-3n-E2B-it"
model_name
=
"google/gemma-3n-E2B-it"
...
@@ -452,6 +480,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
...
@@ -452,6 +480,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map
=
{
model_example_map
=
{
"audioflamingo3"
:
run_audioflamingo3
,
"audioflamingo3"
:
run_audioflamingo3
,
"musicflamingo"
:
run_musicflamingo
,
"gemma3n"
:
run_gemma3n
,
"gemma3n"
:
run_gemma3n
,
"glmasr"
:
run_glmasr
,
"glmasr"
:
run_glmasr
,
"funaudiochat"
:
run_funaudiochat
,
"funaudiochat"
:
run_funaudiochat
,
...
...
tests/models/registry.py
View file @
c46b0cd0
...
@@ -657,6 +657,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -657,6 +657,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0"
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0"
),
),
"MusicFlamingoForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/music-flamingo-2601-hf"
,
min_transformers_version
=
"5.0.0.dev"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
...
...
vllm/model_executor/models/audioflamingo3.py
View file @
c46b0cd0
...
@@ -128,6 +128,12 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
...
@@ -128,6 +128,12 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
super
().
__init__
(
config
)
super
().
__init__
(
config
)
self
.
avg_pooler
=
nn
.
AvgPool1d
(
kernel_size
=
2
,
stride
=
2
)
self
.
avg_pooler
=
nn
.
AvgPool1d
(
kernel_size
=
2
,
stride
=
2
)
# self.layer_norm is already initialized in super().__init__
# self.layer_norm is already initialized in super().__init__
# Keep a dummy freqs parameter for MusicFlamingo checkpoints.
self
.
pos_emb
=
nn
.
Module
()
freqs
=
torch
.
empty
(
getattr
(
config
,
"num_mel_bins"
,
128
))
self
.
pos_emb
.
register_parameter
(
"freqs"
,
nn
.
Parameter
(
freqs
,
requires_grad
=
False
)
)
def
forward
(
def
forward
(
self
,
self
,
...
@@ -146,7 +152,8 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
...
@@ -146,7 +152,8 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
).
to
(
hidden_states
.
dtype
)
).
to
(
hidden_states
.
dtype
)
for
layer
in
self
.
layers
:
for
layer
in
self
.
layers
:
layer_outputs
=
layer
(
hidden_states
,
attention_mask
)
# Qwen2AudioEncoderLayer expects layer_head_mask as third arg.
layer_outputs
=
layer
(
hidden_states
,
attention_mask
,
None
)
hidden_states
=
layer_outputs
[
0
]
hidden_states
=
layer_outputs
[
0
]
# AvgPool (time/2) + LayerNorm
# AvgPool (time/2) + LayerNorm
...
...
vllm/model_executor/models/musicflamingo.py
0 → 100644
View file @
c46b0cd0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""MusicFlamingo model adapter.
MusicFlamingo shares the AudioFlamingo3 architecture, so we reuse the same
implementation and multimodal processor, while accepting MusicFlamingo config
and processor classes when available.
"""
from
collections.abc
import
Mapping
from
transformers.models.audioflamingo3
import
(
AudioFlamingo3Config
,
AudioFlamingo3Processor
,
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
BaseProcessingInfo
from
.audioflamingo3
import
(
AudioFlamingo3DummyInputsBuilder
,
AudioFlamingo3ForConditionalGeneration
,
AudioFlamingo3MultiModalProcessor
,
)
try
:
# Optional dependency: use MusicFlamingo classes when transformers provides them.
from
transformers.models.musicflamingo
import
(
MusicFlamingoConfig
,
MusicFlamingoProcessor
,
)
except
Exception
:
# pragma: no cover - optional dependency
MusicFlamingoConfig
=
None
MusicFlamingoProcessor
=
None
class MusicFlamingoProcessingInfo(BaseProcessingInfo):
    """Processing info for MusicFlamingo checkpoints.

    Uses the MusicFlamingo config/processor classes when the optional import
    at module level succeeded, and the AudioFlamingo3 classes otherwise.
    """

    def get_hf_config(self):
        """Fetch the HF config from the processing context.

        When ``MusicFlamingoConfig`` is available, either it or
        ``AudioFlamingo3Config`` is accepted; otherwise only
        ``AudioFlamingo3Config`` is requested.
        """
        accepted = (
            AudioFlamingo3Config
            if MusicFlamingoConfig is None
            else (MusicFlamingoConfig, AudioFlamingo3Config)
        )
        return self.ctx.get_hf_config(accepted)

    def get_hf_processor(self, **kwargs: object):
        """Fetch the HF processor from the processing context.

        Extra keyword arguments are forwarded unchanged.
        """
        if MusicFlamingoProcessor is not None:
            # Tuple triggers AutoProcessor path and accepts either processor
            # class.
            processor_types = (MusicFlamingoProcessor, AudioFlamingo3Processor)
        else:
            # A bare class (not a 1-tuple) is passed on purpose in the
            # fallback case.
            processor_types = AudioFlamingo3Processor
        return self.ctx.get_hf_processor(processor_types, **kwargs)

    def get_feature_extractor(self, **kwargs: object):
        """Return the ``feature_extractor`` attribute of the HF processor."""
        return self.get_hf_processor(**kwargs).feature_extractor

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        """Declare "audio" as the only supported modality, with limit ``None``."""
        limits: dict[str, int | None] = {"audio": None}
        return limits
class MusicFlamingoDummyInputsBuilder(AudioFlamingo3DummyInputsBuilder):
    """Dummy-inputs builder for MusicFlamingo.

    Adds nothing of its own; all behavior is inherited from
    ``AudioFlamingo3DummyInputsBuilder``.
    """
@MULTIMODAL_REGISTRY.register_processor(
    AudioFlamingo3MultiModalProcessor,
    info=MusicFlamingoProcessingInfo,
    dummy_inputs=MusicFlamingoDummyInputsBuilder,
)
class MusicFlamingoForConditionalGeneration(
    AudioFlamingo3ForConditionalGeneration
):
    """MusicFlamingo model for conditional generation.

    Subclasses ``AudioFlamingo3ForConditionalGeneration`` without overriding
    anything. It is registered with the AudioFlamingo3 multimodal processor,
    paired with the MusicFlamingo processing info and dummy-inputs builder.
    """
vllm/model_executor/models/registry.py
View file @
c46b0cd0
...
@@ -286,6 +286,10 @@ _MULTIMODAL_MODELS = {
...
@@ -286,6 +286,10 @@ _MULTIMODAL_MODELS = {
"audioflamingo3"
,
"audioflamingo3"
,
"AudioFlamingo3ForConditionalGeneration"
,
"AudioFlamingo3ForConditionalGeneration"
,
),
),
"MusicFlamingoForConditionalGeneration"
:
(
"musicflamingo"
,
"MusicFlamingoForConditionalGeneration"
,
),
"AyaVisionForConditionalGeneration"
:
(
"AyaVisionForConditionalGeneration"
:
(
"aya_vision"
,
"aya_vision"
,
"AyaVisionForConditionalGeneration"
,
"AyaVisionForConditionalGeneration"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment