Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -114,7 +114,7 @@ def check_model_available(model: str) -> None:
@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
......
......@@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch HF runner for Isaac:
    1) Move processor outputs to model device
    2) Ensure IsaacModel.forward returns hidden_states
    for compatibility with hidden_states_to_seq_logprobs()

    In addition, ``hf_model.model.generate`` is wrapped so that
    ``pad_token_id`` and ``eos_token_id`` are always set to the tokenizer's
    EOS token id.

    Args:
        hf_model: Runner whose ``processor``, ``model.generate`` and
            ``model.model.forward`` attributes are replaced in place.

    Returns:
        The same ``hf_model`` object, after patching.
    """
    # Imported inside the function rather than at module level.
    # NOTE(review): presumably so this module can be imported without the
    # optional `perceptron` dependency installed - confirm.
    from perceptron.tensorstream import TextType
    from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask
    from transformers.modeling_outputs import BaseModelOutputWithPast

    def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
        """
        Create 3D positional indices for token input.

        Returns a (batch_size, seq_length, 3) tensor in which all three
        components of each position hold the same 0..seq_length-1 index.
        """
        batch_size, seq_length = input_ids.shape
        position_ids = torch.arange(seq_length, device=input_ids.device)
        position_ids = position_ids.view(1, -1).expand(batch_size, -1)
        position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3)  # Add 3D for MRoPE
        return position_ids

    # Device of the first model parameter; processor outputs are moved here.
    model_device = next(hf_model.model.parameters()).device

    # ----------------------------
    # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
    # ----------------------------
    original_processor = hf_model.processor

    def patched_processor(*args, **kwargs):
        """Run the original processor, then move every output to the model device."""
        result = original_processor(*args, **kwargs)
        # NOTE(review): assumes every value in result.data exposes `.to()`;
        # a non-tensor entry would raise AttributeError here - confirm.
        for k, v in result.data.items():
            result[k] = v.to(model_device)
        return result

    hf_model.processor = patched_processor

    tokenizer = AutoTokenizer.from_pretrained(
        hf_model.model_name, trust_remote_code=True
    )

    original_generate = hf_model.model.generate

    def patched_generate(*args, **kwargs):
        """Call the original generate with pad/eos ids forced to the tokenizer EOS id."""
        # Unconditional assignment: any caller-supplied pad_token_id /
        # eos_token_id is overwritten.
        kwargs["pad_token_id"] = tokenizer.eos_token_id
        kwargs["eos_token_id"] = tokenizer.eos_token_id
        return original_generate(*args, **kwargs)

    hf_model.model.generate = patched_generate

    # ----------------------------
    # 2) Patch IsaacModel.forward: add hidden_states to the output
    # ----------------------------
    isaac_model = hf_model.model.model

    def patched_forward(
        self,
        input_ids=None,
        tensor_stream=None,
        attention_mask=None,
        position_ids=None,
        modality_tensor=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
        **kwargs,
    ):
        """
        Forward pass with MRoPE position embeddings.
        Computes position embeddings once and passes them through all layers.

        Exactly one input source is expected: ``tensor_stream``, ``input_ids``
        or ``inputs_embeds``. Always returns a ``BaseModelOutputWithPast``
        whose ``hidden_states`` is a tuple when ``output_hidden_states`` is
        truthy and ``None`` otherwise.

        Raises:
            ValueError: If ``inputs_embeds`` is combined with ``tensor_stream``
                or ``input_ids``, or if no input source is given at all.
        """
        # Fall back to the model config for flags the caller left unset.
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        # NOTE(review): return_dict is resolved here but never read again in
        # this function; the return value is always a BaseModelOutputWithPast.
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # Get inputs
        if tensor_stream is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both tensor_stream and inputs_embeds")
        elif tensor_stream is not None:
            # Embed TensorStream directly
            inputs_embeds = self.embed_stream(tensor_stream)
            # Create modality tensor if not provided
            if modality_tensor is None:
                modality_tensor = modality_mask(tensor_stream)
        elif input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            inputs_embeds = self.embed_tokens(input_ids)
            # Create text modality tensor if not provided
            if modality_tensor is None:
                batch_size, seq_length = input_ids.shape
                modality_tensor = torch.full(
                    (batch_size, seq_length),
                    TextType.text.value,
                    device=input_ids.device,
                    dtype=torch.long,
                )
        elif inputs_embeds is None:
            raise ValueError(
                "You have to specify either tensor_stream, input_ids or inputs_embeds"
            )

        # Create default position_ids if not provided
        if position_ids is None:
            if tensor_stream is not None:
                position_ids = compute_mrope_pos_tensor(tensor_stream)  # (B,L,3)
            else:
                # NOTE(review): when only inputs_embeds was supplied, input_ids
                # is None here and the helper would fail on `.shape` - confirm
                # that callers never reach this path without position_ids.
                position_ids = compute_position_ids_input_ids(input_ids)

        # Compute MRoPE position embeddings if we have custom rotary_emb
        cos, sin = self.rotary_emb(position_ids, modality_tensor)
        # Match the embedding dtype before handing cos/sin to the layers.
        cos = cos.to(inputs_embeds.dtype)
        sin = sin.to(inputs_embeds.dtype)

        # Prepare attention mask
        if attention_mask is not None:
            # NOTE(review): the trailing positional False is presumably the
            # `output_attentions` flag of _update_causal_mask - confirm.
            attention_mask = self._update_causal_mask(
                attention_mask, inputs_embeds, cache_position, past_key_values, False
            )

        # Initialize and collect hidden states
        hidden_states = inputs_embeds
        hidden_states_list: list[torch.Tensor] = []

        # First entry: the raw input embeddings.
        if output_hidden_states:
            hidden_states_list.append(hidden_states)

        for decoder_layer in self.layers:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=(cos, sin),
                **kwargs,
            )

            # A layer may return either a tuple (hidden states first) or the
            # hidden states directly.
            hidden_states = (
                layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
            )

            # One entry per decoder layer output.
            if output_hidden_states:
                hidden_states_list.append(hidden_states)

        # Final layer norm
        hidden_states = self.norm(hidden_states)

        # Last entry: the normalized output, appended in addition to the
        # un-normalized output of the final layer recorded in the loop above.
        if output_hidden_states:
            hidden_states_list.append(hidden_states)

        # Convert to tuple or None
        all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None

        # Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
        )

    # Bind the replacement to this model instance only, not to the class.
    isaac_model.forward = types.MethodType(patched_forward, isaac_model)

    return hf_model
def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
......
......@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import os
import warnings
import pytest
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if not current_platform.is_rocm():
return
@pytest.fixture
def siglip_attention_config():
"""Return attention config for SigLIP tests on ROCm.
siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
if siglip_tests:
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
warnings.warn(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
UserWarning,
stacklevel=1,
)
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if current_platform.is_rocm():
return {"backend": "FLEX_ATTENTION"}
return None
......@@ -78,7 +78,9 @@ def run_intern_vit_test(
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
def test_models(
default_vllm_config, dist_init, image_assets, model_id, dtype: str
) -> None:
run_intern_vit_test(
image_assets,
model_id,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import cast
import pytest
from transformers import AutoModel
from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageEmbedsParam,
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner
model_name = "jinaai/jina-reranker-m0"
MODELS = ["jinaai/jina-reranker-m0"]
mm_processor_kwargs = {
MM_PROCESSOR_KWARGS = {
"min_pixels": 3136,
"max_pixels": 602112,
}
limit_mm_per_prompt = {"image": 2}
LIMIT_MM_PER_PROMPT = {"image": 2}
# Prefix mapping handed to the HF runner as `model_kwargs["key_mapping"]`
# in `_run_hf`.
# NOTE(review): presumably maps checkpoint weight-name prefixes to the module
# prefixes expected by the HF model class - confirm.
CHECKPOINT_TO_HF_MAPPER = {
    "visual.": "model.visual.",
    "model.": "model.language_model.",
}
# Shared long text for test data
LONG_TEXT_DOC = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""" # noqa: E501
# Test data for different scenarios
# Each scenario is a dict with a one-element "query" list and a "documents"
# list; every entry is either {"text": ...} or {"image": <url>}.

# Text query scored against two image documents.
TEXT_IMAGE_TEST_DATA = {
    "query": [{"text": "slm markdown"}],
    "documents": [
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
        },
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
        },
    ],
}

# Text query scored against two text documents.
TEXT_TEXT_TEST_DATA = {
    "query": [{"text": "slm markdown"}],
    "documents": [
        {"text": LONG_TEXT_DOC},
        {"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
    ],
}

# Image query scored against two text documents.
IMAGE_TEXT_TEST_DATA = {
    "query": [
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
        }
    ],
    "documents": [
        {"text": LONG_TEXT_DOC},
        {"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
    ],
}

# Image query scored against two image documents (the second document is the
# same image as the query).
IMAGE_IMAGE_TEST_DATA = {
    "query": [
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
        }
    ],
    "documents": [
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
        },
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
        },
    ],
}
def vllm_reranker(
# Text query scored against four documents that alternate between text and
# image entries (text, image, text, image).
TEXT_MIXED_DOCS_TEST_DATA = {
    "query": [{"text": "slm markdown"}],
    "documents": [
        {"text": LONG_TEXT_DOC},
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
        },
        {"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
        {
            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
        },
    ],
}
def _normalize_image(image_val: str) -> str:
"""Normalize image value to proper format for HF model."""
return (
image_val
if image_val.startswith(("http://", "https://"))
else f"data:image/png;base64,{image_val}"
)
def create_score_multimodal_param(
    content_parts: list[dict],
) -> ScoreMultiModalParam:
    """
    Create a ScoreMultiModalParam from a list of content dictionaries.

    Each dict supports the following formats:
    - Text: {'text': 'content'}
    - Image URL: {'image': 'https://...'}
    - Image Base64: {'image': 'base64_str'}

    A 'text' key takes precedence when a dict carries both keys.

    Args:
        content_parts: Content dictionaries in the formats listed above.

    Returns:
        A ScoreMultiModalParam whose ``content`` holds one formatted part per
        input dict, in the same order.

    Raises:
        ValueError: If a dict contains neither a 'text' nor an 'image' key.
    """
    formatted_content = []
    for idx, part in enumerate(content_parts):
        if "text" in part:
            formatted_content.append(
                ChatCompletionContentPartTextParam(
                    type="text",
                    text=part["text"],
                )
            )
        elif "image" in part:
            image_val = part["image"]
            if image_val.startswith(("http://", "https://")):
                formatted_content.append(
                    ChatCompletionContentPartImageParam(
                        type="image_url",
                        image_url={"url": image_val},
                    )
                )
            else:
                # NOTE(review): non-URL values are sent as `image_embeds`,
                # whereas the HF path in this file wraps them as a base64 PNG
                # data URI - confirm both sides agree on what the string holds.
                formatted_content.append(
                    ChatCompletionContentPartImageEmbedsParam(
                        type="image_embeds", image_embeds=image_val
                    )
                )
        else:
            # Fail fast instead of silently dropping the part: a dropped part
            # would shift the vLLM scores relative to the HF scores.
            raise ValueError(f"Unsupported content format at index {idx}")
    return ScoreMultiModalParam(content=formatted_content)
def _run_vllm(
vllm_runner: type[VllmRunner],
model_name: str,
model: str,
dtype: str,
query_strs: list[str],
document_strs: list[str],
query_type: str = "text",
doc_type: str = "text",
):
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
return {"type": "image_url", "image_url": {"url": f"{url}"}}
query: list[str] | ScoreMultiModalParam
if query_type == "text":
query = query_strs
elif query_type == "image":
query = ScoreMultiModalParam(
content=[create_image_param(url) for url in query_strs]
)
documents: list[str] | ScoreMultiModalParam
if doc_type == "text":
documents = document_strs
elif doc_type == "image":
documents = ScoreMultiModalParam(
content=[create_image_param(url) for url in document_strs]
)
query_strs: list[dict[str, str]],
document_strs: list[dict[str, str]],
) -> list[float]:
"""Run vLLM reranker and return scores."""
query = create_score_multimodal_param(query_strs)
documents = create_score_multimodal_param(document_strs)
with vllm_runner(
model_name,
model,
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
mm_processor_kwargs=MM_PROCESSOR_KWARGS,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs]
def hf_reranker(
def _run_hf(
hf_runner: type[HfRunner],
model_name: str,
model: str,
dtype: str,
query_strs: list[str],
document_strs: list[str],
query_type: str = "text",
doc_type: str = "text",
):
checkpoint_to_hf_mapper = {
"visual.": "model.visual.",
"model.": "model.language_model.",
}
data_pairs = [[query_strs[0], d] for d in document_strs]
query_strs: list[dict[str, str]],
document_strs: list[dict[str, str]],
) -> list[float]:
"""Run HuggingFace reranker and return scores."""
query = query_strs[0]
if "text" in query:
query_type = "text"
query_data = query["text"]
elif "image" in query:
query_type = "image"
query_data = _normalize_image(query["image"])
else:
raise ValueError("Unsupported query format")
# Separate documents by type
text_docs: list[str] = []
image_docs: list[str] = []
text_indices: list[int] = []
image_indices: list[int] = []
for idx, doc in enumerate(document_strs):
if "text" in doc:
text_docs.append(doc["text"])
text_indices.append(idx)
elif "image" in doc:
image_docs.append(_normalize_image(doc["image"]))
image_indices.append(idx)
else:
raise ValueError(f"Unsupported document format at index {idx}")
scores: list[None | float] = [None] * len(document_strs)
with hf_runner(
model_name,
model,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
model_kwargs={"key_mapping": CHECKPOINT_TO_HF_MAPPER},
) as hf_model:
return hf_model.model.compute_score(
data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
)
# Score text documents
if text_docs:
text_scores = hf_model.model.compute_score(
[[query_data, d] for d in text_docs],
max_length=2048,
query_type=query_type,
doc_type="text",
)
for i, s in zip(text_indices, text_scores):
scores[i] = s
# Score image documents
if image_docs:
image_scores = hf_model.model.compute_score(
[[query_data, d] for d in image_docs],
max_length=2048,
query_type=query_type,
doc_type="image",
)
for i, s in zip(image_indices, image_scores):
scores[i] = s
# Visual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "image"
)
assert all(s is not None for s in scores)
return cast(list[float], scores)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
model: str,
dtype: str,
query_strs: list[dict[str, str]],
document_strs: list[dict[str, str]],
) -> None:
"""Run comparison test between vLLM and HuggingFace implementations."""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# Textual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "text"
vllm_outputs = _run_vllm(vllm_runner, model, dtype, query_strs, document_strs)
hf_outputs = _run_hf(hf_runner, model, dtype, query_strs, document_strs)
# Compare outputs
assert len(hf_outputs) == len(vllm_outputs), (
f"Output length mismatch: HF={len(hf_outputs)}, vLLM={len(vllm_outputs)}"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
for i, (hf_score, vllm_score) in enumerate(zip(hf_outputs, vllm_outputs)):
assert hf_score == pytest.approx(vllm_score, rel=0.02), (
f"Score mismatch at index {i}: HF={hf_score}, vLLM={vllm_score}"
)
# Image Querying for Textual Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
query = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
]
documents = [
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "text"
def test_model_text_image(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Visual document reranking: text query against image documents."""
    scenario = TEXT_IMAGE_TEST_DATA
    _run_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query_strs=scenario["query"],
        document_strs=scenario["documents"],
    )
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Textual document reranking: text query against text documents."""
    scenario = TEXT_TEXT_TEST_DATA
    _run_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query_strs=scenario["query"],
        document_strs=scenario["documents"],
    )
# Image Querying for Image Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
query = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "image"
def test_model_image_text(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Image querying for textual documents: image query against text documents."""
    scenario = IMAGE_TEXT_TEST_DATA
    _run_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query_strs=scenario["query"],
        document_strs=scenario["documents"],
    )
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "image"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Image querying for image documents: image query against image documents."""
    scenario = IMAGE_IMAGE_TEST_DATA
    _run_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query_strs=scenario["query"],
        document_strs=scenario["documents"],
    )
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_mixed_documents(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Text query against a mix of text and image documents."""
    scenario = TEXT_MIXED_DOCS_TEST_DATA
    _run_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query_strs=scenario["query"],
        document_strs=scenario["documents"],
    )
......@@ -40,15 +40,15 @@ def run_radio_test(
for image in images
]
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# RADIO model on HF does not properly handle torch_dtype argument
# And relies on args["dtype"] which we have to patch manually:
config.args["dtype"] = torch_dtype
hf_config.args["dtype"] = torch_dtype
hf_model = AutoModel.from_pretrained(
model_id,
config=config,
config=hf_config,
dtype=torch_dtype,
trust_remote_code=True,
).to("cuda")
......@@ -62,13 +62,14 @@ def run_radio_test(
hf_model.make_preprocessor_external()
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
]
radio_config = RadioConfig(
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
vllm_config = RadioConfig(
model_name=hf_config.args["model"],
**hf_config.args,
)
vllm_model = RadioModel(radio_config)
vllm_model = RadioModel(vllm_config)
vllm_model.load_weights(hf_model.state_dict())
vllm_model = vllm_model.to("cuda", torch_dtype)
......@@ -80,7 +81,8 @@ def run_radio_test(
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
@pytest.mark.parametrize(
......@@ -90,7 +92,9 @@ def run_radio_test(
],
)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
def test_radio(
default_vllm_config, dist_init, image_assets, model_id, dtype: str
) -> None:
run_radio_test(
image_assets,
model_id,
......
......@@ -38,6 +38,7 @@ def _run_test(
*,
dtype: str,
tokenization_kwargs: dict[str, Any] | None = None,
attention_config: dict[str, Any] | None = None,
) -> None:
if tokenization_kwargs is None:
tokenization_kwargs = {}
......@@ -49,6 +50,7 @@ def _run_test(
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
attention_config=attention_config,
) as vllm_model:
vllm_outputs = vllm_model.embed(
input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
......@@ -90,6 +92,7 @@ def test_models_text(
hf_runner,
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
......@@ -108,6 +111,7 @@ def test_models_text(
"padding": "max_length",
"max_length": 64,
}, # siglip2 was trained with this padding setting.
attention_config=siglip_attention_config,
)
......@@ -117,6 +121,7 @@ def test_models_image(
hf_runner,
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
......@@ -133,6 +138,7 @@ def test_models_image(
input_images,
model,
dtype=dtype,
attention_config=siglip_attention_config,
)
......@@ -141,6 +147,7 @@ def test_models_image(
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
......@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
attention_config=siglip_attention_config,
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)
......
......@@ -86,11 +86,25 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
return mm_data
def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    """
    Trim the multimodal data down to what GLM-ASR accepts.

    GLM-ASR requires text and audio to match 1:1, so an "audio" list holding
    more than one entry is replaced by a one-element list with its first
    entry. The dict is modified in place and returned.
    """
    audio = mm_data.get("audio")
    # Only a list with several clips needs trimming; anything else (missing
    # key, non-list value, empty or single-element list) is left untouched.
    if isinstance(audio, list) and len(audio) > 1:
        mm_data["audio"] = audio[:1]
    return mm_data
# For some multimodal models, the tokenizer always adds bos_token
# at the beginning of the prompt by default, causing hf_processor to output
# incorrect token ids. So we need to use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"nemotron_parse": False,
"ovis": False,
"ovis2_5": False,
"paligemma": False,
......@@ -106,9 +120,11 @@ _IGNORE_MM_KEYS = {
}
MM_DATA_PATCHES = {
# GLM4.1V and Qwen3-VL requires video metadata to be included in the input
# Ernie4.5-VL, GLM4.1V and Qwen3-VL require video metadata
"ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
"glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data,
"glmasr": glmasr_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
}
......@@ -212,7 +228,11 @@ def _test_processing_correctness(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_info.check_transformers_version(
on_fail="skip",
check_max_version=False,
check_version_reason="vllm",
)
model_config = ModelConfig(
model_id,
......@@ -386,6 +406,11 @@ def test_processing_correctness(
pytest.skip("Fix later")
if model_id == "jinaai/jina-reranker-m0":
pytest.skip("Fix later")
if model_id in {"Qwen/Qwen-VL", "Qwen/Qwen-VL-Chat"}:
pytest.skip(
"Qwen-VL tokenizer requires downloading a font file from "
"servers that often refuse connections in CI"
)
_test_processing_correctness(
model_id,
......
......@@ -2,14 +2,154 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.model_executor.models.gemma3n_audio_utils import (
adjust_audio_features_to_expected_length,
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import ImageTestAssets
from ...utils import build_model_context
# Gemma3 (image) model
GEMMA3_MODEL_ID = "google/gemma-3-4b-it"
@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
# Gemma3n (multimodal with audio) model
GEMMA3N_MODEL_ID = "google/gemma-3n-E2B-it"
# Expected audio tokens for Gemma3n (audio_soft_tokens_per_image)
GEMMA3N_EXPECTED_AUDIO_TOKENS = 188
class TestGemma3nAudioTensorLogic:
"""CPU-based tests for Gemma3n audio feature tensor manipulation.
These tests validate the padding/truncation logic in
adjust_audio_features_to_expected_length() which fixes the
integer overflow in _process_audio_input when audio_seq_len > 188.
"""
def test_padding_when_audio_short(self):
"""Test that short audio is padded to expected length."""
batch_size, seq_len, embed_dim = 1, 100, 256
expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features = torch.randn(batch_size, seq_len, embed_dim)
padding_embs = torch.zeros(1, 1, embed_dim)
result, tokens_truncated = adjust_audio_features_to_expected_length(
audio_features, expected_tokens, padding_embs
)
assert result.shape == (batch_size, expected_tokens, embed_dim)
assert tokens_truncated == 0
# First 100 tokens should be original, rest should be padding (zeros)
assert torch.allclose(result[:, :seq_len, :], audio_features)
assert torch.allclose(
result[:, seq_len:, :],
torch.zeros(batch_size, expected_tokens - seq_len, embed_dim),
)
def test_truncation_when_audio_long(self):
"""Test that long audio is truncated to expected length.
This is the key test for the overflow fix. Previously, when
audio_seq_len > expected_tokens, the code would compute a negative
padding value causing: RuntimeError: numel: integer multiplication overflow
"""
batch_size, seq_len, embed_dim = 1, 192, 256 # 192 > 188
expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features = torch.randn(batch_size, seq_len, embed_dim)
padding_embs = torch.zeros(1, 1, embed_dim)
result, tokens_truncated = adjust_audio_features_to_expected_length(
audio_features, expected_tokens, padding_embs
)
assert result.shape == (batch_size, expected_tokens, embed_dim)
assert tokens_truncated == seq_len - expected_tokens # 192 - 188 = 4
# Result should be first 188 tokens of original
assert torch.allclose(result, audio_features[:, :expected_tokens, :])
def test_no_change_when_exact_length(self):
"""Test that exact-length audio passes through unchanged."""
batch_size, embed_dim = 1, 256
expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features = torch.randn(batch_size, expected_tokens, embed_dim)
padding_embs = torch.zeros(1, 1, embed_dim)
result, tokens_truncated = adjust_audio_features_to_expected_length(
audio_features, expected_tokens, padding_embs
)
assert result.shape == audio_features.shape
assert tokens_truncated == 0
assert torch.allclose(result, audio_features)
def test_original_bug_would_fail(self):
"""Verify the original buggy implementation would cause overflow.
The original code always tried to pad, which fails when
audio_seq_len > expected_tokens because expand() gets negative size.
"""
batch_size, seq_len, embed_dim = 1, 192, 256
expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
padding_embs = torch.zeros(1, 1, embed_dim)
# Original buggy logic (always pads, never truncates)
extra_padding_tokens = expected_tokens - seq_len # = -4 (negative!)
with pytest.raises(RuntimeError):
# This should fail with negative size error
padding_embs.expand(batch_size, extra_padding_tokens, embed_dim)
@pytest.mark.parametrize(
    "seq_len",
    [50, 100, 150, 187, 188, 189, 192, 200, 300],
)
def test_various_audio_lengths(self, seq_len: int):
    """Exercise padding and truncation across a range of input lengths.

    Whatever the input length, the output must have exactly the expected
    number of tokens, and the reported truncation count must equal the
    excess over that number (zero when there is no excess).
    """
    target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
    hidden = 256
    features = torch.randn(1, seq_len, hidden)
    pad = torch.zeros(1, 1, hidden)

    # Must complete without raising for every parametrized length.
    adjusted, num_dropped = adjust_audio_features_to_expected_length(
        features, target_len, pad
    )

    assert adjusted.shape == (1, target_len, hidden)
    # Excess tokens are dropped; shorter or equal inputs drop nothing.
    assert num_dropped == max(seq_len - target_len, 0)
def test_batch_processing(self):
    """Check the helper on a batch of more than one audio clip.

    Uses four clips that are each longer than the expected token count,
    and asserts both the output shape and the reported truncation count.
    """
    num_clips, audio_len, hidden = 4, 192, 256
    target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
    features = torch.randn(num_clips, audio_len, hidden)
    pad = torch.zeros(1, 1, hidden)

    adjusted, num_dropped = adjust_audio_features_to_expected_length(
        features, target_len, pad
    )

    assert adjusted.shape == (num_clips, target_len, hidden)
    assert num_dropped == audio_len - target_len
@pytest.mark.parametrize("model_id", [GEMMA3_MODEL_ID])
def test_get_image_size_with_most_features(
image_assets: ImageTestAssets, model_id: str
):
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Qwen3 Omni audio processing and sample rate handling."""
from typing import Any
import numpy as np
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"])
@pytest.mark.parametrize(
    ("audio_sample_rate", "audio_duration_sec"),
    [
        (16000, 1.0),  # Native Whisper sample rate, 1 second
        (16000, 2.0),  # Native Whisper sample rate, 2 seconds
    ],
)
def test_processor_with_audio_sample_rate(
    model_id: str,
    audio_sample_rate: int,
    audio_duration_sec: float,
) -> None:
    """
    Check that the processor emits audio tokens when given audio_sample_rate.

    A processor is built for ``model_id`` with a limit of one audio item,
    fed seeded random audio of the requested duration together with
    ``audio_sample_rate`` in the HF processor kwargs, and the resulting
    prompt token ids must contain at least one audio token.
    """
    model_ctx = build_model_context(
        model_id,
        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    tokenizer = processor.info.get_tokenizer()

    # Seeded random waveform with sample_rate * duration samples.
    num_samples = int(audio_sample_rate * audio_duration_sec)
    waveform = np.random.RandomState(42).rand(num_samples).astype(np.float32)

    # The sample rate is forwarded through the HF processor kwargs.
    hf_processor_mm_kwargs: dict[str, Any] = {
        "audio_sample_rate": audio_sample_rate,
    }
    processed_inputs = processor.apply(
        "<|audio_start|><|audio_pad|><|audio_end|>",
        {"audio": [(waveform, audio_sample_rate)]},
        hf_processor_mm_kwargs,
    )

    # Count occurrences of the audio token id in the processed prompt.
    audio_token = processor.info.get_hf_processor(
        **hf_processor_mm_kwargs
    ).audio_token
    audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
    aud_tok_count = processed_inputs["prompt_token_ids"].count(audio_token_id)

    assert aud_tok_count >= 1, (
        f"Expected at least 1 audio token but got {aud_tok_count}. "
        f"sample_rate: {audio_sample_rate}Hz, duration: {audio_duration_sec}s"
    )
@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"])
def test_longer_audio_generates_more_tokens(model_id: str) -> None:
    """
    Check that a 2 s clip yields more audio tokens than a 1 s clip.

    Both clips are random audio at 16 kHz drawn from one seeded generator
    (the 1 s clip is drawn first), processed with ``audio_sample_rate``
    passed in the HF processor kwargs.
    """
    model_ctx = build_model_context(
        model_id,
        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    tokenizer = processor.info.get_tokenizer()

    audio_sample_rate = 16000
    rng = np.random.RandomState(42)
    prompt = "<|audio_start|><|audio_pad|><|audio_end|>"

    def count_audio_tokens(duration: float) -> int:
        """Process ``duration`` seconds of random audio; count audio tokens."""
        num_samples = int(audio_sample_rate * duration)
        waveform = rng.rand(num_samples).astype(np.float32)
        # A fresh kwargs dict is built per call, as in the original helper.
        mm_kwargs: dict[str, Any] = {
            "audio_sample_rate": audio_sample_rate,
        }
        processed = processor.apply(
            prompt,
            {"audio": [(waveform, audio_sample_rate)]},
            mm_kwargs,
        )
        audio_token = processor.info.get_hf_processor(**mm_kwargs).audio_token
        audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
        return processed["prompt_token_ids"].count(audio_token_id)

    # Order matters: both calls draw from the same random generator.
    short_tokens = count_audio_tokens(1.0)
    long_tokens = count_audio_tokens(2.0)

    assert long_tokens > short_tokens, (
        f"Expected longer audio (2s) to have more tokens than shorter (1s). "
        f"Got short={short_tokens}, long={long_tokens}"
    )
......@@ -138,25 +138,25 @@ def create_batched_mm_kwargs(
)
# TODO(Isotr0py): Don't initalize model during test
# TODO(Isotr0py): Don't initialize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],
model_config: ModelConfig,
):
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)
with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config)
......@@ -172,7 +172,11 @@ def initialize_dummy_model(
def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_info.check_transformers_version(
on_fail="skip",
check_max_version=False,
check_version_reason="vllm",
)
model_arch = next(
arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info
......
......@@ -9,7 +9,7 @@ import os
import pytest
from tests.quantization.utils import is_quant_method_supported
from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
from vllm.v1.attention.backends.fa_utils import flash_attn_supports_fp8
from vllm.platforms import current_platform
from ..utils import check_logprobs_close
from ...utils import models_path_prefix
......@@ -76,7 +76,6 @@ def test_models(
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", "true")
m.setenv("VLLM_ATTENTION_BACKEND", backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
......@@ -87,6 +86,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
attention_config={"backend": backend},
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS
......@@ -98,6 +98,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
attention_config={"backend": backend},
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS
......
......@@ -65,7 +65,10 @@ def test_models(
num_logprobs: int,
) -> None:
with vllm_runner(
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24",
allow_deprecated_quantization=True,
) as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
......
......@@ -51,9 +51,11 @@ class _HfExamplesInfo:
The maximum version of HF Transformers that this model runs on.
"""
transformers_version_reason: str | None = None
transformers_version_reason: dict[Literal["vllm", "hf"], str] | None = None
"""
The reason for the minimum/maximum version requirement.
The type and reason to skip test for the minimum/maximum version requirement.
vllm: skip all vLLM tests if the version requirement is not met.
hf: only skip tests that uses HF runner if the version requirement is not met.
"""
require_embed_inputs: bool = False
......@@ -113,6 +115,7 @@ class _HfExamplesInfo:
self,
*,
on_fail: Literal["error", "skip", "return"],
check_version_reason: Literal["vllm", "hf"] = "hf",
check_min_version: bool = True,
check_max_version: bool = True,
) -> str | None:
......@@ -133,23 +136,28 @@ class _HfExamplesInfo:
msg = f"`transformers=={current_version}` installed, but `transformers"
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if (
check_min_version
and min_version
and Version(cur_base_version) < Version(min_version)
):
if min_version and Version(cur_base_version) < Version(min_version):
is_version_valid = not check_min_version
msg += f">={min_version}` is required to run this model."
elif (
check_max_version
and max_version
and Version(cur_base_version) > Version(max_version)
):
elif max_version and Version(cur_base_version) > Version(max_version):
is_version_valid = not check_max_version
msg += f"<={max_version}` is required to run this model."
else:
return None
is_version_valid = True
if self.transformers_version_reason:
msg += f" Reason: {self.transformers_version_reason}"
# check if Transformers version breaks the corresponding model runner,
# skip test when model runner not compatible
is_reason_valid = not (
check_version_reason
and self.transformers_version_reason
and check_version_reason in self.transformers_version_reason
)
is_transformers_valid = is_version_valid and is_reason_valid
if is_transformers_valid:
return None
elif self.transformers_version_reason:
for reason_type, reason in self.transformers_version_reason.items():
msg += f" Reason({reason_type}): {reason}"
if on_fail == "error":
raise RuntimeError(msg)
......@@ -219,7 +227,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"CwmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/cwm"), min_transformers_version="4.58"),
"DbrxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "databricks/dbrx-instruct")),
# FIXME: databricks/dbrx-instruct has been deleted
"DbrxForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "databricks/dbrx-instruct"), is_available_online=False
),
"DeciLMForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "nvidia/Llama-3_3-Nemotron-Super-49B-v1"),
trust_remote_code=True,
......@@ -243,6 +254,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), trust_remote_code=True
),
"Exaone4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-4.0-32B")),
"ExaoneMoEForCausalLM": _HfExamplesInfo(
"LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
),
"Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mgleize/fairseq2-dummy-Llama-3.2-1B")),
"FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-7b")),
"FalconH1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/Falcon-H1-0.5B-Base")),
......@@ -282,6 +296,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Grok1ModelForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "hpcai-tech/grok-1"), trust_remote_code=True
),
"Grok1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "xai-org/grok-2"), trust_remote_code=True),
"HunYuanDenseV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tencent/Hunyuan-7B-Instruct")),
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "tencent/Hunyuan-A13B-Instruct"), trust_remote_code=True
......@@ -302,6 +317,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Jais2ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "inceptionai/Jais-2-8B-Chat"), min_transformers_version="4.58"
),
"IQuestCoderForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "IQuestLab/IQuest-Coder-V1-40B-Instruct"), trust_remote_code=True
),
"IQuestLoopCoderForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"), trust_remote_code=True
),
"JAISLMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix, "inceptionai/jais-13b-chat")),
"Jais2ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "inceptionai/Jais-2-8B-Chat"), min_transformers_version="4.58"
),
"JambaForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "ai21labs/AI21-Jamba-1.5-Mini"),
extras={
......@@ -348,6 +373,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MiniCPM3ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"), trust_remote_code=True
),
"MiniCPM4ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "openbmb/MiniCPM4.1-8B"), trust_remote_code=True
),
"MiniMaxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01-hf")),
"MiniMaxText01ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
......@@ -370,7 +398,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{"tiny": os.path.join(models_path_prefix, "TitanML/tiny-mixtral")},
),
"MptForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mpt"), is_available_online=False),
"MPTForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mosaicml/mpt-7b")),
# FIXME: mosaicml/mpt-7b has been deleted
"MPTForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mosaicml/mpt-7b"), is_available_online=False),
"NemotronForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/Minitron-8B-Base")),
"NemotronHForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "nvidia/Nemotron-H-8B-Base-8K"), trust_remote_code=True
......@@ -394,6 +423,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"PanguEmbeddedForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "FreedomIntelligence/openPangu-Embedded-7B-V1.1"), trust_remote_code=True
),
"PanguProMoEV2ForCausalLM": _HfExamplesInfo(
"",
trust_remote_code=True,
is_available_online=False,
),
"PanguUltraMoEForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1"),
trust_remote_code=True,
......@@ -416,7 +450,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"QWenLMHeadModel": _HfExamplesInfo(
os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"),
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501
transformers_version_reason={
"hf": "HF model uses remote code that is not compatible with latest Transformers" # noqa: E501
},
trust_remote_code=True,
),
"Qwen2ForCausalLM": _HfExamplesInfo(
......@@ -463,6 +499,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"Zamba2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Zyphra/Zamba2-7B-instruct")),
"MiMoForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"), trust_remote_code=True),
"MiMoV2FlashForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-V2-Flash"), trust_remote_code=True
),
"Dots1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "rednote-hilab/dots.llm1.inst")),
}
......@@ -484,7 +523,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"), trust_remote_code=True
),
"JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")),
"LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama", is_available_online=False)),
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
"LlamaBidirectionalModel": _HfExamplesInfo(
os.path.join(models_path_prefix, "nvidia/llama-nemotron-embed-1b-v2"), trust_remote_code=True
),
"MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
"ModernBertModel": _HfExamplesInfo(
os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"), trust_remote_code=True
......@@ -496,12 +538,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Qwen2ForRewardModel": _HfExamplesInfo(
os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"),
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501
transformers_version_reason={
"hf": "HF model uses remote code that is not compatible with latest Transformers" # noqa: E501
},
),
"Qwen2ForProcessRewardModel": _HfExamplesInfo(
os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-PRM-7B"),
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501
transformers_version_reason={
"hf": "HF model uses remote code that is not compatible with latest Transformers" # noqa: E501
},
),
"RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),
"RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/all-roberta-large-v1")),
......@@ -551,6 +597,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
trust_remote_code=True,
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
),
"LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
"nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
),
"ModernBertForSequenceClassification": _HfExamplesInfo(
os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base")
),
......@@ -581,6 +630,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
os.path.join(models_path_prefix, "tomaarsen/Qwen3-Reranker-0.6B-seq-cls")
),
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
"Qwen3VLForSequenceClassification": _HfExamplesInfo(
"Qwen/Qwen3-VL-Reranker-2B",
is_available_online=False,
hf_overrides={
"architectures": ["Qwen3VLForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
},
),
}
_MULTIMODAL_EXAMPLE_MODELS = {
......@@ -607,7 +665,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os.path.join(models_path_prefix, "deepseek-ai/deepseek-vl2-tiny"),
extras={"fork": os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")},
max_transformers_version="4.48",
transformers_version_reason="HF model is not compatible.",
transformers_version_reason={"hf": "HF model is not compatible."},
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
),
"DeepseekOCRForCausalLM": _HfExamplesInfo(
......@@ -624,6 +682,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"FuyuForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/fuyu-8b")),
"Gemma3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3-4b-it")),
"Gemma3nForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3n-E2B-it")),
"GlmAsrForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "zai-org/GLM-ASR-Nano-2512"),
trust_remote_code=True,
min_transformers_version="5.0",
),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-2b")
),
......@@ -639,7 +702,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True,
extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")},
max_transformers_version="4.48",
transformers_version_reason="HF model is not compatible.",
transformers_version_reason={"hf": "HF model is not compatible."},
),
"HCXVisionForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"),
......@@ -653,6 +716,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),
extras={"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")},
),
"IsaacForConditionalGeneration": _HfExamplesInfo(
"PerceptronAI/Isaac-0.1",
trust_remote_code=True,
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
),
"InternS1ForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "internlm/Intern-S1"), trust_remote_code=True
),
......@@ -668,6 +736,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"InternVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")),
"KananaVForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "kakaocorp/kanana-1.5-v-3b-instruct"),
trust_remote_code=True,
),
"KeyeForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
trust_remote_code=True,
......@@ -681,13 +753,21 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")},
trust_remote_code=True,
max_transformers_version="4.53.3",
transformers_version_reason="HF model uses deprecated transformers API "
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
transformers_version_reason={
"hf": (
"HF model uses deprecated transformers API "
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
)
},
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "lightonai/LightOnOCR-1B-1025")
),
"Lfm2VlForConditionalGeneration": _HfExamplesInfo(
"LiquidAI/LFM2-VL-450M",
min_transformers_version="5.0.0",
),
"Llama4ForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
max_model_len=10240,
......@@ -712,7 +792,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MantisForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3"),
max_transformers_version="4.48",
transformers_version_reason="HF model is not compatible.",
transformers_version_reason={"hf": "HF model is not compatible."},
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
),
"MiDashengLMModel": _HfExamplesInfo(
......@@ -739,7 +819,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MolmoForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
max_transformers_version="4.48",
transformers_version_reason="Incorrectly-detected `tensorflow` import.",
transformers_version_reason={
"vllm": "Incorrectly-detected `tensorflow` import from processor."
},
extras={"olmo": os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924")},
trust_remote_code=True,
),
......@@ -758,7 +840,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B"),
trust_remote_code=True,
max_transformers_version="4.53",
transformers_version_reason="HF model is not compatible",
transformers_version_reason={"hf": "HF model is not compatible"},
extras={
"1.6-llama": os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Llama3.2-3B"),
"1.6-gemma": os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B"),
......@@ -777,7 +859,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
trust_remote_code=True,
max_transformers_version="4.48",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
transformers_version_reason={
"hf": "HF model use deprecated imports which have been removed."
}, # noqa: E501
extras={"phi3.5": os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")},
),
"Phi4MMForCausalLM": _HfExamplesInfo(
......@@ -796,7 +880,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"chat": os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat")},
trust_remote_code=True,
max_transformers_version="4.53.3",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
transformers_version_reason={
"hf": "HF model uses deprecated imports which have been removed."
}, # noqa: E501
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
),
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo(
......@@ -851,7 +937,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# disable this temporarily until we support HF format
is_available_online=False,
),
"VoxtralStreamingGeneration": _HfExamplesInfo(
"<place-holder>",
# disable this temporarily until we support HF format
is_available_online=False,
),
# [Encoder-decoder]
"NemotronParseForConditionalGeneration": _HfExamplesInfo(
"nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
),
"WhisperForConditionalGeneration": _HfExamplesInfo(
os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo"),
extras={"v3": os.path.join(models_path_prefix, "openai/whisper-large-v3")},
......@@ -926,6 +1020,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code=True,
speculative_model=os.path.join(models_path_prefix, "baidu/ERNIE-4.5-21B-A3B-PT"),
),
"ExaoneMoeMTP": _HfExamplesInfo(
"LGAI-EXAONE/K-EXAONE-236B-A23B",
speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
min_transformers_version="5.0.0",
),
"Glm4MoeMTPModel": _HfExamplesInfo(
os.path.join(models_path_prefix, "zai-org/GLM-4.5"),
speculative_model="zai-org/GLM-4.5",
......
......@@ -66,7 +66,11 @@ def can_initialize(
model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_info.check_transformers_version(
on_fail="skip",
check_max_version=False,
check_version_reason="vllm",
)
hf_overrides_fn = partial(
dummy_hf_overrides,
......@@ -108,11 +112,12 @@ def can_initialize(
patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
monkeypatch.context() as m,
):
if model_arch == "GptOssForCausalLM":
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
attention_config = (
{"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None
)
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
......@@ -143,6 +148,7 @@ def can_initialize(
else "vllm",
hf_overrides=hf_overrides_fn,
max_num_seqs=model_info.max_num_seqs,
attention_config=attention_config,
)
......
......@@ -34,7 +34,11 @@ models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_M
def test_registry_imports(model_arch):
# Skip if transformers version is incompatible
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_transformers_version(on_fail="skip")
model_info.check_transformers_version(
on_fail="skip",
check_max_version=False,
check_version_reason="vllm",
)
# Ensure all model classes can be imported successfully
model_cls = ModelRegistry._try_load_model_cls(model_arch)
assert model_cls is not None
......
......@@ -38,7 +38,7 @@ def test_inference(
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_output = vllm_model.llm.encode(prompt)
vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
assert torch.equal(
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
)
......@@ -21,6 +21,7 @@ from vllm.model_executor.models.vision import (
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
pytestmark = pytest.mark.cpu_test
......@@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct(
"""
# Set random seed for reproducibility
current_platform.seed_everything(0)
set_random_seed(0)
device = f"{current_platform.device_name}:{local_rank}"
current_platform.set_device(device)
......@@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
calling the model directly.
"""
# Set random seed for reproducibility
current_platform.seed_everything(0)
set_random_seed(0)
device = f"{current_platform.device_name}:{local_rank}"
current_platform.set_device(device)
torch.set_default_device(device)
......@@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
):
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
# Set up distributed environment
current_platform.seed_everything(123)
set_random_seed(123)
device = f"{current_platform.device_name}:{local_rank}"
current_platform.set_device(device)
torch.set_default_device(device)
......
......@@ -10,7 +10,8 @@ import torch
import torch.nn.functional as F
from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
from vllm.config.pooler import SequencePoolingType, TokenPoolingType
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext
from vllm.tokenizers import cached_tokenizer_from_config
......@@ -292,7 +293,11 @@ def build_model_context(
"""
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_info.check_transformers_version(
on_fail="skip",
check_max_version=False,
check_version_reason="vllm",
)
model_config_kwargs = model_config_kwargs or {}
limit_mm_per_prompt = limit_mm_per_prompt or {}
......@@ -375,7 +380,11 @@ class ModelInfo:
max_model_len: int | None = None
hf_dtype: str = "float32"
hf_overrides: dict[str, Any] | None = None
default_pooling_type: str = ""
seq_pooling_type: SequencePoolingType | None = None
tok_pooling_type: TokenPoolingType | None = None
attn_type: AttnTypeStr | None = None
is_prefix_caching_supported: bool | None = None
is_chunked_prefill_supported: bool | None = None
enable_test: bool = True
......@@ -386,29 +395,10 @@ class EmbedModelInfo(ModelInfo):
matryoshka_dimensions: list[int] | None = None
@dataclass
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
default_pooling_type: str = "LAST"
@dataclass
class RerankModelInfo(ModelInfo):
mteb_score: float | None = None
@dataclass
class CLSPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "CLS"
@dataclass
class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type: str = "LAST"
chat_template_name: str | None = None
@dataclass
......@@ -483,12 +473,16 @@ def dummy_hf_overrides(
"num_kv_shared_layers": 1,
}
_hf_config = hf_config
class DummyConfig:
hf_config = _hf_config
hf_text_config = text_config
model_arch_config = ModelConfig.get_model_arch_config(DummyConfig)
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if ModelConfig.get_num_experts(DummyConfig) > 0:
if model_arch_config.num_experts > 0:
update_dict.update(
{
"num_experts": num_experts,
......
......@@ -7,10 +7,16 @@ from unittest.mock import patch
import numpy as np
import pytest
import torch
from vllm.multimodal.audio import (
MONO_AUDIO_SPEC,
PASSTHROUGH_AUDIO_SPEC,
AudioMediaIO,
AudioResampler,
AudioSpec,
ChannelReduction,
normalize_audio,
resample_audio_librosa,
resample_audio_scipy,
)
......@@ -137,3 +143,500 @@ def test_audio_media_io_encode_base64(dummy_audio):
decoded = base64.b64decode(out)
assert decoded == b"dummy_wav_data"
mock_write.assert_called_once()
# ============================================================
# Tests for normalize_audio function
# ============================================================
class TestNormalizeAudio:
    """Exercise ``normalize_audio`` against the different audio specs.

    Covers the passthrough spec, mono down-mixing of numpy and torch inputs,
    the FIRST / MAX / SUM channel reductions, inputs that must be rejected,
    and the handling of ``(time, channels)`` versus ``(channels, time)``
    layouts.
    """

    def test_passthrough_preserves_audio(self):
        """The passthrough spec must return the samples unmodified."""
        two_channel = np.array(
            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
            dtype=np.float32,
        )
        untouched = normalize_audio(two_channel, PASSTHROUGH_AUDIO_SPEC)
        np.testing.assert_array_equal(untouched, two_channel)

    def test_mono_spec_with_numpy_stereo(self):
        """A two-channel numpy array collapses to 1D under the mono spec."""
        left_right = np.array([[1.0, 2.0], [-1.0, 0.0]], dtype=np.float32)
        mixed = normalize_audio(left_right, MONO_AUDIO_SPEC)
        assert mixed.ndim == 1
        # Per-sample channel average: (1 + -1) / 2 and (2 + 0) / 2.
        np.testing.assert_array_almost_equal(mixed, [0.0, 1.0])

    def test_mono_spec_with_torch_stereo(self):
        """A two-channel torch tensor collapses to 1D under the mono spec."""
        left_right = torch.tensor([[1.0, 2.0], [-1.0, 0.0]])
        mixed = normalize_audio(left_right, MONO_AUDIO_SPEC)
        assert mixed.ndim == 1
        torch.testing.assert_close(mixed, torch.tensor([0.0, 1.0]))

    def test_mono_passthrough_for_1d_numpy(self):
        """Already-mono numpy input comes back unchanged under the mono spec."""
        samples = np.array([1.0, 2.0, 3.0], dtype=np.float32)
        out = normalize_audio(samples, MONO_AUDIO_SPEC)
        assert out.ndim == 1
        np.testing.assert_array_equal(out, samples)

    def test_mono_passthrough_for_1d_torch(self):
        """Already-mono torch input comes back unchanged under the mono spec."""
        samples = torch.tensor([1.0, 2.0, 3.0])
        out = normalize_audio(samples, MONO_AUDIO_SPEC)
        assert out.ndim == 1
        torch.testing.assert_close(out, samples)

    def test_first_channel_reduction(self):
        """``ChannelReduction.FIRST`` keeps only channel 0."""
        first_only = AudioSpec(
            target_channels=1,
            channel_reduction=ChannelReduction.FIRST,
        )
        two_channel = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
        np.testing.assert_array_equal(
            normalize_audio(two_channel, first_only),
            [1.0, 2.0],
        )

    def test_max_channel_reduction(self):
        """``ChannelReduction.MAX`` takes the per-sample maximum over channels."""
        take_max = AudioSpec(
            target_channels=1,
            channel_reduction=ChannelReduction.MAX,
        )
        two_channel = np.array([[1.0, 4.0], [3.0, 2.0]], dtype=np.float32)
        np.testing.assert_array_equal(
            normalize_audio(two_channel, take_max),
            [3.0, 4.0],
        )

    def test_sum_channel_reduction(self):
        """``ChannelReduction.SUM`` adds the channels sample by sample."""
        add_up = AudioSpec(
            target_channels=1,
            channel_reduction=ChannelReduction.SUM,
        )
        two_channel = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
        np.testing.assert_array_equal(
            normalize_audio(two_channel, add_up),
            [4.0, 6.0],
        )

    def test_invalid_3d_array_raises(self):
        """A rank-3 array is rejected with ``ValueError``."""
        rank3 = np.random.randn(2, 3, 4).astype(np.float32)
        with pytest.raises(ValueError, match="Unsupported audio"):
            normalize_audio(rank3, MONO_AUDIO_SPEC)

    def test_channel_expansion_raises(self):
        """Asking for two channels from mono input raises ``ValueError``."""
        samples = np.array([1.0, 2.0, 3.0], dtype=np.float32)
        want_stereo = AudioSpec(target_channels=2)
        with pytest.raises(ValueError, match="Cannot expand"):
            normalize_audio(samples, want_stereo)

    def test_time_channels_format_numpy(self):
        """A ``(1000, 2)`` numpy input is down-mixed to 1000 mono samples.

        Models loaders (such as soundfile) that yield ``(time, channels)``
        rather than the ``(channels, time)`` layout torchaudio produces.
        """
        # 1000 time steps, each row holding the pair [1.0, -1.0].
        time_major = np.tile(np.array([1.0, -1.0], dtype=np.float32), (1000, 1))
        assert time_major.shape == (1000, 2)  # (time, channels)

        mixed = normalize_audio(time_major, MONO_AUDIO_SPEC)

        assert mixed.ndim == 1
        assert mixed.shape == (1000,)
        # The two channels cancel at every time step.
        np.testing.assert_array_almost_equal(mixed, np.zeros(1000))

    def test_time_channels_format_torch(self):
        """A ``(1000, 2)`` torch input is down-mixed to 1000 mono samples."""
        # 1000 time steps, each row holding the pair [1.0, -1.0].
        time_major = torch.tensor([1.0, -1.0]).repeat(1000, 1)
        assert time_major.shape == (1000, 2)  # (time, channels)

        mixed = normalize_audio(time_major, MONO_AUDIO_SPEC)

        assert mixed.ndim == 1
        assert mixed.shape == (1000,)
        # The two channels cancel at every time step.
        torch.testing.assert_close(mixed, torch.zeros(1000))

    def test_channels_time_format_preserved(self):
        """A ``(2, 1000)`` input in ``(channels, time)`` layout is handled."""
        # Channel 0 is all +1.0, channel 1 is all -1.0.
        channel_major = np.repeat(
            np.array([[1.0], [-1.0]], dtype=np.float32),
            1000,
            axis=1,
        )
        assert channel_major.shape == (2, 1000)  # (channels, time)

        mixed = normalize_audio(channel_major, MONO_AUDIO_SPEC)

        assert mixed.ndim == 1
        assert mixed.shape == (1000,)
        # The two channels cancel at every time step.
        np.testing.assert_array_almost_equal(mixed, np.zeros(1000))

    def test_ambiguous_square_audio_numpy(self):
        """A square ``(4, 4)`` input is reduced along axis 0.

        With equal dimensions the layout cannot be inferred from the shape;
        the expected values below correspond to reading the input as
        ``(channels, time)``, i.e. averaging down the rows.
        """
        # Rows are [1..4], [5..8], [9..12], [13..16].
        square = np.arange(1.0, 17.0, dtype=np.float32).reshape(4, 4)
        assert square.shape == (4, 4)

        mixed = normalize_audio(square, MONO_AUDIO_SPEC)

        assert mixed.ndim == 1
        assert mixed.shape == (4,)
        # Column means: (1+5+9+13)/4, (2+6+10+14)/4, ...
        column_means = np.array([7.0, 8.0, 9.0, 10.0])
        np.testing.assert_array_almost_equal(mixed, column_means)
# ============================================================
# Tests for MultiModalDataParser integration with target_channels
# ============================================================
class TestMultiModalDataParserChannelNormalization:
    """Check how ``MultiModalDataParser`` applies ``target_channels``.

    Each test feeds an ``(audio, sample_rate)`` tuple to
    ``_parse_audio_data`` and inspects the first parsed item.
    """

    @staticmethod
    def _build_parser(target_channels):
        """Return a 16 kHz parser configured with ``target_channels``.

        The import stays local so ``vllm.multimodal.parse`` is only loaded
        when a test actually runs.
        """
        from vllm.multimodal.parse import MultiModalDataParser

        return MultiModalDataParser(
            target_sr=16000,
            target_channels=target_channels,
        )

    def test_parser_normalizes_stereo_to_mono(self):
        """With ``target_channels=1`` a stereo clip comes out mono."""
        parser = self._build_parser(1)

        # 2 channels x 3 samples, in (channels, time) layout.
        two_channel = np.array(
            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
            dtype=np.float32,
        )

        audio_item = parser._parse_audio_data((two_channel, 16000)).get(0)

        assert audio_item.ndim == 1, f"Expected 1D mono audio, got {audio_item.ndim}D"
        assert audio_item.shape == (3,), f"Expected shape (3,), got {audio_item.shape}"
        # Averaging +1 and -1 gives silence at every sample.
        np.testing.assert_array_almost_equal(audio_item, np.zeros(3))

    def test_parser_preserves_stereo_when_target_channels_none(self):
        """With ``target_channels=None`` a stereo clip stays two-dimensional."""
        parser = self._build_parser(None)

        two_channel = np.array(
            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
            dtype=np.float32,
        )

        audio_item = parser._parse_audio_data((two_channel, 16000)).get(0)

        # No channel normalization requested, so the channel axis must survive.
        assert audio_item.ndim == 2, f"Expected 2D stereo audio, got {audio_item.ndim}D"

    def test_parser_mono_passthrough_when_target_channels_1(self):
        """With ``target_channels=1`` a 1D clip keeps its rank and length."""
        parser = self._build_parser(1)

        one_second = np.random.randn(16000).astype(np.float32)

        audio_item = parser._parse_audio_data((one_second, 16000)).get(0)

        assert audio_item.ndim == 1
        assert audio_item.shape == (16000,)

    def test_parser_with_target_channels_2(self):
        """With ``target_channels=2`` a 6-channel clip comes out with 2 channels."""
        parser = self._build_parser(2)

        # Six channels, as in a 5.1 surround recording.
        six_channel = np.random.randn(6, 1000).astype(np.float32)

        audio_item = parser._parse_audio_data((six_channel, 16000)).get(0)

        assert audio_item.ndim == 2
        assert audio_item.shape[0] == 2  # 2 channels
# ============================================================
# End-to-End Audio Pipeline Tests
# ============================================================
class TestAudioPipelineE2E:
    """End-to-end checks of audio normalization through the parser.

    Each test builds raw audio in the layout a particular loader would
    produce, runs it through ``MultiModalDataParser._parse_audio_data``, and
    inspects the first parsed item.
    """

    @staticmethod
    def _build_parser(target_channels):
        """Return a 16 kHz parser configured with ``target_channels``.

        The import stays local so ``vllm.multimodal.parse`` is only loaded
        when a test actually runs.
        """
        from vllm.multimodal.parse import MultiModalDataParser

        return MultiModalDataParser(
            target_sr=16000,
            target_channels=target_channels,
        )

    def test_stereo_audio_normalized_to_mono_e2e(self):
        """Stereo in ``(channels, time)`` layout -> mono output."""
        # torchaudio-style layout: channel 0 all +1.0, channel 1 all -1.0,
        # one second at 16 kHz.
        stereo_torchaudio = np.repeat(
            np.array([[1.0], [-1.0]], dtype=np.float32),
            16000,
            axis=1,
        )
        assert stereo_torchaudio.shape == (2, 16000)

        # Mono target, as used by Whisper-style models.
        parser = self._build_parser(1)
        audio_output = parser._parse_audio_data((stereo_torchaudio, 16000)).get(0)

        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
        assert audio_output.shape == (16000,)
        # +1.0 and -1.0 average to 0.0.
        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)

    def test_soundfile_format_normalized_to_mono_e2e(self):
        """Stereo in ``(time, channels)`` layout -> mono output."""
        # soundfile-style layout: every row is one [left, right] frame.
        stereo_soundfile = np.tile(
            np.array([0.5, -0.5], dtype=np.float32),
            (16000, 1),
        )
        assert stereo_soundfile.shape == (16000, 2)

        parser = self._build_parser(1)
        audio_output = parser._parse_audio_data((stereo_soundfile, 16000)).get(0)

        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
        assert audio_output.shape == (16000,)
        # +0.5 and -0.5 average to 0.0.
        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)

    def test_librosa_mono_passthrough_e2e(self):
        """Mono ``(time,)`` input -> same samples out."""
        # librosa-style output: already a 1D array.
        mono_librosa = np.random.randn(16000).astype(np.float32)
        assert mono_librosa.shape == (16000,)

        parser = self._build_parser(1)
        audio_output = parser._parse_audio_data((mono_librosa, 16000)).get(0)

        assert audio_output.ndim == 1
        assert audio_output.shape == (16000,)
        # Content must survive the round trip.
        np.testing.assert_array_almost_equal(audio_output, mono_librosa)

    def test_multichannel_5_1_surround_to_mono_e2e(self):
        """Six-channel (5.1 surround) input -> mono output."""
        # Channel k (0-based) is constant at k + 1:
        # FL=1, FR=2, C=3, LFE=4, RL=5, RR=6.
        surround_audio = np.repeat(
            np.arange(1.0, 7.0, dtype=np.float32).reshape(6, 1),
            8000,
            axis=1,
        )
        assert surround_audio.shape == (6, 8000)

        parser = self._build_parser(1)
        audio_output = parser._parse_audio_data((surround_audio, 16000)).get(0)

        assert audio_output.ndim == 1
        # Mean of 1..6 is 3.5.
        expected_value = (1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0) / 6
        np.testing.assert_array_almost_equal(
            audio_output, np.full(8000, expected_value), decimal=5
        )

    def test_torch_tensor_input_e2e(self):
        """Stereo data originating as a ``torch.Tensor`` -> mono numpy output."""
        # Two channels of 8000 samples, as torchaudio would return.
        stereo_torch = torch.tensor(
            [[1.0], [-1.0]],
            dtype=torch.float32,
        ).repeat(1, 8000)
        assert stereo_torch.shape == (2, 8000)

        parser = self._build_parser(1)
        # The tensor is converted to numpy before being handed to the parser.
        audio_output = parser._parse_audio_data((stereo_torch.numpy(), 16000)).get(0)

        assert audio_output.ndim == 1
        assert isinstance(audio_output, np.ndarray)
        # +1.0 and -1.0 average to 0.0.
        np.testing.assert_array_almost_equal(audio_output, np.zeros(8000), decimal=5)

    def test_passthrough_preserves_stereo_e2e(self):
        """Stereo with ``target_channels=None`` -> stereo shape kept."""
        stereo_audio = np.repeat(
            np.array([[1.0], [-1.0]], dtype=np.float32),
            8000,
            axis=1,
        )

        # None disables channel normalization.
        parser = self._build_parser(None)
        audio_output = parser._parse_audio_data((stereo_audio, 16000)).get(0)

        assert audio_output.ndim == 2
        assert audio_output.shape == (2, 8000)

    def test_resampling_with_channel_normalization_e2e(self):
        """48 kHz stereo -> 16 kHz mono in a single parse call."""
        # One second of stereo at 48 kHz.
        stereo_48k = np.repeat(
            np.array([[1.0], [-1.0]], dtype=np.float32),
            48000,
            axis=1,
        )

        # The parser targets 16 kHz and one channel.
        parser = self._build_parser(1)
        audio_output = parser._parse_audio_data((stereo_48k, 48000)).get(0)

        assert audio_output.ndim == 1
        # One second of input must yield one second at the target rate.
        assert audio_output.shape[0] == 16000

    def test_very_short_audio_e2e(self):
        """A 10-sample stereo clip is still reduced to mono."""
        short_stereo = np.repeat(
            np.array([[1.0], [-1.0]], dtype=np.float32),
            10,
            axis=1,
        )

        parser = self._build_parser(1)
        audio_output = parser._parse_audio_data((short_stereo, 16000)).get(0)

        assert audio_output.ndim == 1
        assert audio_output.shape == (10,)
        np.testing.assert_array_almost_equal(audio_output, np.zeros(10))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment