"docs/vscode:/vscode.git/clone" did not exist on "d79d9eaaff90801668613a4e3d5d8a0004963f21"
Unverified Commit 3abb7560 authored by Roger Wang's avatar Roger Wang Committed by GitHub
Browse files

[Bugfix] Fix audioflamingo test (#40052)


Signed-off-by: default avatarRoger Wang <hey@rogerw.io>
parent 617d1c2f
......@@ -140,88 +140,3 @@ def test_audio_token_count_matches_hf_processor_math():
_count_audio_tokens_from_mask(feature_attention_mask, chunk_counts, 0) == 1499
)
assert _count_audio_tokens_from_mask(feature_attention_mask, chunk_counts, 1) == 375
def test_audio_feature_pipeline_matches_hf_small_config():
from transformers.models.audioflamingo3 import (
modeling_audioflamingo3 as hf_audioflamingo3_modeling,
)
from transformers.models.audioflamingo3.configuration_audioflamingo3 import (
AudioFlamingo3Config,
)
from vllm.model_executor.models.audioflamingo3 import (
AudioFlamingo3Encoder,
AudioFlamingo3MultiModalProjector,
_build_audio_encoder_attention_mask,
_flatten_valid_audio_embeddings,
)
text_config = {
"model_type": "qwen2",
"intermediate_size": 64,
"initializer_range": 0.02,
"hidden_size": 32,
"max_position_embeddings": 1024,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"num_key_value_heads": 2,
"vocab_size": 128,
"pad_token_id": 1,
"use_mrope": False,
}
audio_config = {
"hidden_size": 16,
"num_attention_heads": 4,
"intermediate_size": 32,
"num_hidden_layers": 2,
"num_mel_bins": 80,
"max_source_positions": 1500,
"dropout": 0.0,
"attention_dropout": 0.0,
"activation_dropout": 0.0,
"encoder_layerdrop": 0.0,
}
torch.manual_seed(0)
config = AudioFlamingo3Config(
text_config=text_config,
audio_config=audio_config,
audio_token_id=0,
)
hf_model = hf_audioflamingo3_modeling.AudioFlamingo3ForConditionalGeneration(
config
).eval()
vllm_encoder = AudioFlamingo3Encoder(config.audio_config).eval()
vllm_encoder.load_state_dict(hf_model.audio_tower.state_dict())
vllm_projector = AudioFlamingo3MultiModalProjector(config).eval()
vllm_projector.load_state_dict(hf_model.multi_modal_projector.state_dict())
input_features = torch.randn(3, 80, 3000)
feature_attention_mask = torch.zeros(3, 3000, dtype=torch.bool)
feature_attention_mask[0, :3000] = True
feature_attention_mask[1, :2500] = True
feature_attention_mask[2, :1500] = True
hf_output = hf_model.get_audio_features(
input_features,
feature_attention_mask,
return_dict=True,
).pooler_output
vllm_attention_mask = _build_audio_encoder_attention_mask(
feature_attention_mask,
dtype=vllm_encoder.conv1.weight.dtype,
device=vllm_encoder.conv1.weight.device,
)
vllm_hidden_states = vllm_encoder(
input_features,
attention_mask=vllm_attention_mask,
)
vllm_output, _ = _flatten_valid_audio_embeddings(
vllm_projector(vllm_hidden_states),
feature_attention_mask,
)
torch.testing.assert_close(vllm_output, hf_output)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment