Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
...@@ -17,89 +17,6 @@ from unittest.mock import MagicMock, patch ...@@ -17,89 +17,6 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
import torch import torch
from vllm.model_executor.layers.quantization.mxfp4 import (
Mxfp4Backend,
Mxfp4MoEMethod,
)
def _make_mock_moe_config(ep_size: int = 1) -> MagicMock:
"""Create a mock FusedMoEConfig with the given EP size."""
parallel_config = MagicMock()
parallel_config.ep_size = ep_size
moe_config = MagicMock()
moe_config.ep_size = ep_size
moe_config.is_lora_enabled = False
moe_config.moe_parallel_config = parallel_config
return moe_config
class TestMxfp4TritonIsMonolithic:
    """Check ``Mxfp4MoEMethod.is_monolithic`` for each MXFP4 backend.

    The TRITON backend is expected to be monolithic at every EP size,
    since triton_kernel_moe_forward now handles expert_map remapping
    internally. The SM100 backends are expected to be monolithic as well,
    while MARLIN is expected never to be.
    """

    @pytest.mark.parametrize(
        "backend,ep_size,expected_monolithic",
        [
            # TRITON is always monolithic (handles EP via expert_map remapping)
            pytest.param(Mxfp4Backend.TRITON, 1, True, id="triton-no-ep"),
            pytest.param(Mxfp4Backend.TRITON, 2, True, id="triton-ep2"),
            pytest.param(Mxfp4Backend.TRITON, 4, True, id="triton-ep4"),
            # SM100 backends are always monolithic
            pytest.param(
                Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM,
                1,
                True,
                id="sm100-trtllm-no-ep",
            ),
            pytest.param(
                Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM,
                2,
                True,
                id="sm100-trtllm-ep2",
            ),
            pytest.param(
                Mxfp4Backend.SM100_FI_MXFP4_BF16, 1, True, id="sm100-bf16-no-ep"
            ),
            pytest.param(
                Mxfp4Backend.SM100_FI_MXFP4_BF16, 2, True, id="sm100-bf16-ep2"
            ),
            # MARLIN is never monolithic
            pytest.param(Mxfp4Backend.MARLIN, 1, False, id="marlin-no-ep"),
            pytest.param(Mxfp4Backend.MARLIN, 2, False, id="marlin-ep2"),
        ],
    )
    @patch("vllm.model_executor.layers.quantization.mxfp4.get_mxfp4_backend")
    @patch("vllm.model_executor.layers.quantization.mxfp4.get_current_vllm_config")
    def test_is_monolithic(
        self,
        mock_get_config,
        mock_get_backend,
        backend,
        ep_size,
        expected_monolithic,
    ):
        """is_monolithic should be True for TRITON regardless of EP size."""
        # Force the backend selection to the parametrized backend.
        mock_get_backend.return_value = backend

        # The patched config getter returns a mock whose compilation config
        # reports a cudagraph capture size of 1024.
        vllm_config = MagicMock()
        vllm_config.compilation_config.max_cudagraph_capture_size = 1024
        mock_get_config.return_value = vllm_config

        method = Mxfp4MoEMethod(_make_mock_moe_config(ep_size=ep_size))

        assert method.is_monolithic == expected_monolithic, (
            f"Expected is_monolithic={expected_monolithic} for "
            f"backend={backend.name}, ep_size={ep_size}, "
            f"but got {method.is_monolithic}."
        )
class TestTritonMoeForwardExpertMap: class TestTritonMoeForwardExpertMap:
"""Test that triton_kernel_moe_forward applies expert_map remapping """Test that triton_kernel_moe_forward applies expert_map remapping
......
...@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode ...@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode
BIAS = torch.rand(m, dtype=dtype, device="cuda") * 2 - 1 BIAS = torch.rand(m, dtype=dtype, device="cuda") * 2 - 1
elif bias_mode == 2: elif bias_mode == 2:
BIAS = torch.rand(n, m, dtype=dtype, device="cuda") * 2 - 1 BIAS = torch.rand(n, m, dtype=dtype, device="cuda") * 2 - 1
elif bias_mode == 3:
BIAS = torch.rand(1, m, dtype=dtype, device="cuda") * 2 - 1
ref_out = torch.nn.functional.linear(A, B, BIAS) ref_out = torch.nn.functional.linear(A, B, BIAS)
out = ops.wvSplitKrc(A, B, cu_count, BIAS) out = ops.wvSplitKrc(A, B, cu_count, BIAS)
...@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel( ...@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel(
ref_out = torch.nn.functional.linear(A, B, BIAS) ref_out = torch.nn.functional.linear(A, B, BIAS)
out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS) out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
if xnorm: # Accumulation error in fp16 GEMM scales with sqrt(K)
assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-8) atol = torch.finfo(dtype).eps * math.sqrt(k)
else: torch.testing.assert_close(out, ref_out, atol=atol, rtol=1e-2)
assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-2)
@pytest.mark.parametrize("xnorm", [False, True]) @pytest.mark.parametrize("xnorm", [False, True])
......
...@@ -294,6 +294,11 @@ def whisper_lora_files(): ...@@ -294,6 +294,11 @@ def whisper_lora_files():
return snapshot_download(repo_id="chengyili2005/whisper-small-mandarin-lora") return snapshot_download(repo_id="chengyili2005/whisper-small-mandarin-lora")
@pytest.fixture(scope="session")
def qwen35_dense_model_lora_files():
    """Session-scoped fixture: the ``snapshot_download`` result for the
    ``jeeejeee/qwen35-4b-text-only-sql-lora`` repo."""
    return snapshot_download(repo_id="jeeejeee/qwen35-4b-text-only-sql-lora")
@pytest.fixture @pytest.fixture
def reset_default_device(): def reset_default_device():
""" """
......
...@@ -711,3 +711,192 @@ def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, devic ...@@ -711,3 +711,192 @@ def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, devic
torch.testing.assert_close( torch.testing.assert_close(
packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b
) )
def _test_target_modules(
    model,
    target_modules: list[str] | None,
    device: str,
    expected_lora: list[tuple[str, type]],
    expected_no_lora: list[tuple[str, type]],
):
    """Wrap ``model`` with a LoRAModelManager and check the resulting modules.

    Args:
        model: Model handed to ``LoRAModelManager``; its submodules are
            inspected afterwards through ``get_submodule``.
        target_modules: Forwarded to ``LoRAConfig(target_modules=...)``.
        device: Forwarded to ``LoRAModelManager(device=...)``.
        expected_lora: ``(module_path, cls)`` pairs whose submodule must be
            an instance of ``cls``.
        expected_no_lora: ``(module_path, cls)`` pairs whose submodule must
            NOT be an instance of ``cls``.
    """
    lora_config = LoRAConfig(
        max_lora_rank=8,
        max_cpu_loras=2,
        max_loras=2,
        lora_dtype=DEFAULT_DTYPE,
        target_modules=target_modules,
    )
    # NOTE(review): the three positional 2s are presumably max_num_seqs,
    # max_num_batched_tokens and vocab_size — confirm against the
    # LoRAModelManager signature.
    # The manager is constructed only for its effect on ``model``.
    LoRAModelManager(model, 2, 2, 2, lora_config, device=device)

    for path, wrapper_cls in expected_lora:
        assert isinstance(model.get_submodule(path), wrapper_cls)

    for path, wrapper_cls in expected_no_lora:
        assert not isinstance(model.get_submodule(path), wrapper_cls)
@pytest.mark.parametrize("device", DEVICES)
def test_target_modules_config(default_vllm_config, dist_init, dummy_model, device):
    """Test that target_modules=["dense1"] applies LoRA to dense1 layers only."""
    wrapped = [
        ("dense1", ColumnParallelLinearWithLoRA),
        ("layer1.dense1", ColumnParallelLinearWithLoRA),
    ]
    # dense2 is not listed in target_modules, so it must stay unwrapped.
    untouched = [
        ("dense2", RowParallelLinearWithLoRA),
        ("layer1.dense2", RowParallelLinearWithLoRA),
    ]
    _test_target_modules(
        model=dummy_model,
        target_modules=["dense1"],
        device=device,
        expected_lora=wrapped,
        expected_no_lora=untouched,
    )
@pytest.mark.parametrize("device", DEVICES)
def test_target_modules_multiple(default_vllm_config, dist_init, dummy_model, device):
    """Test that listing both dense1 and dense2 applies LoRA to all of them."""
    wrapped = [
        ("dense1", ColumnParallelLinearWithLoRA),
        ("layer1.dense1", ColumnParallelLinearWithLoRA),
        ("dense2", RowParallelLinearWithLoRA),
        ("layer1.dense2", RowParallelLinearWithLoRA),
    ]
    _test_target_modules(
        model=dummy_model,
        target_modules=["dense1", "dense2"],
        device=device,
        expected_lora=wrapped,
        expected_no_lora=[],
    )
@pytest.mark.parametrize("device", DEVICES)
def test_target_modules_none_uses_all(
    default_vllm_config, dist_init, dummy_model, device
):
    """Test that target_modules=None applies LoRA to dense1 and dense2 alike."""
    wrapped = [
        ("dense1", ColumnParallelLinearWithLoRA),
        ("layer1.dense1", ColumnParallelLinearWithLoRA),
        ("dense2", RowParallelLinearWithLoRA),
        ("layer1.dense2", RowParallelLinearWithLoRA),
    ]
    _test_target_modules(
        model=dummy_model,
        target_modules=None,
        device=device,
        expected_lora=wrapped,
        expected_no_lora=[],
    )
@pytest.mark.parametrize("device", DEVICES)
def test_load_adapter_warns_on_unsupported_modules(
    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
):
    """Check that ``_load_adapter`` warns about an unsupported LoRA module.

    A checkpoint loader patched to add an ``unsupported_module`` entry is
    used, and the test passes when some recorded ``warning_once`` call
    mentions that name.
    """
    from unittest.mock import patch

    import vllm.lora.worker_manager as wm_module

    lora_config = LoRAConfig(
        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
    )

    # Write an adapter for the dummy model under the pytest temp directory.
    adapter_dir = f"{tmp_path}/lora_adapter"
    os.makedirs(adapter_dir, exist_ok=True)
    create_peft_lora(
        dummy_model_gate_up,
        save_dir=adapter_dir,
        target_modules=["layer1.dense1", "dense2"],
        lora_dtype=DEFAULT_DTYPE,
    )

    vllm_config = VllmConfig(
        model_config=ModelConfig(max_model_len=16), lora_config=lora_config
    )
    vllm_config.scheduler_config.max_num_seqs = 4
    vllm_config.scheduler_config.max_num_batched_tokens = 2

    manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
    manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
    manager.create_lora_manager(dummy_model_gate_up)

    # Keep a handle on the real loader so the replacement can delegate to it.
    real_loader = LoRAModel.from_local_checkpoint

    def loader_with_extra_module(*args, **kwargs):
        """Load the checkpoint normally, then add one unsupported entry."""
        loaded = real_loader(*args, **kwargs)
        loaded.loras["unsupported_module"] = LoRALayerWeights(
            module_name="unsupported_module",
            rank=8,
            lora_alpha=16,
            lora_a=torch.randn(8, 10),
            lora_b=torch.randn(10, 8),
        )
        return loaded

    request = LoRARequest("test", 1, adapter_dir)
    with (
        patch.object(LoRAModel, "from_local_checkpoint", loader_with_extra_module),
        patch.object(wm_module.logger, "warning_once") as warning_spy,
    ):
        manager._load_adapter(request)

    recorded = warning_spy.call_args_list
    assert any("unsupported_module" in str(call) for call in recorded), (
        f"Expected warning about 'unsupported_module', got: {recorded}"
    )
@pytest.mark.parametrize("device", DEVICES)
def test_load_adapter_warns_on_target_modules_restriction(
    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
):
    """Check that ``_load_adapter`` warns about a target_modules exclusion.

    The adapter is created for ``layer1.dense1`` and ``dense2`` while the
    LoRA config restricts ``target_modules`` to ``dense2``. The test passes
    when some recorded ``warning_once`` call mentions ``target_modules``.
    """
    from unittest.mock import patch

    import vllm.lora.worker_manager as wm_module

    # Restrict to only dense2 — adapter has dense1 which will be excluded
    restricted_config = LoRAConfig(
        max_lora_rank=8,
        max_cpu_loras=4,
        max_loras=4,
        lora_dtype=DEFAULT_DTYPE,
        target_modules=["dense2"],
    )

    # Write an adapter for the dummy model under the pytest temp directory.
    adapter_dir = f"{tmp_path}/lora_adapter"
    os.makedirs(adapter_dir, exist_ok=True)
    create_peft_lora(
        dummy_model_gate_up,
        save_dir=adapter_dir,
        target_modules=["layer1.dense1", "dense2"],
        lora_dtype=DEFAULT_DTYPE,
    )

    vllm_config = VllmConfig(
        model_config=ModelConfig(max_model_len=16), lora_config=restricted_config
    )
    vllm_config.scheduler_config.max_num_seqs = 4
    vllm_config.scheduler_config.max_num_batched_tokens = 2

    manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
    manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
    manager.create_lora_manager(dummy_model_gate_up)

    request = LoRARequest("test", 1, adapter_dir)
    with patch.object(wm_module.logger, "warning_once") as warning_spy:
        manager._load_adapter(request)

    recorded = warning_spy.call_args_list
    # dense1 is supported by the model but excluded by target_modules
    assert any("target_modules" in str(call) for call in recorded), (
        f"Expected warning about target_modules restriction, got: {recorded}"
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.lora.utils import is_in_target_modules, is_supported_lora_module
class TestIsSupportedLoraModule:
    """Tests for is_supported_lora_module (model-definition check)."""

    def test_suffix_match(self):
        """A module whose last path component is listed is supported."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert is_supported_lora_module(module_name, ["o_proj", "q_proj"])

    def test_no_match(self):
        """A module whose last path component is not listed is rejected."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert not is_supported_lora_module(module_name, ["q_proj", "k_proj"])

    def test_exact_match(self):
        """A bare module name equal to a listed entry is supported."""
        assert is_supported_lora_module("o_proj", ["o_proj"])

    def test_regex_suffix_matching(self):
        """Regex anchors to end — partial suffix should not match."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert not is_supported_lora_module(module_name, ["proj"])

    def test_empty_supported_modules(self):
        """An empty supported list rejects the module."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert not is_supported_lora_module(module_name, [])

    def test_multiple_supported_modules(self):
        """With several supported names, only listed suffixes are accepted."""
        attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
        assert is_supported_lora_module(
            "model.layers.0.self_attn.v_proj", attention_projections
        )
        assert not is_supported_lora_module(
            "model.layers.0.mlp.gate_proj", attention_projections
        )
class TestIsInTargetModules:
    """Tests for is_in_target_modules (deployment-time filter)."""

    def test_none_allows_all(self):
        """``None`` as the target list accepts the module."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert is_in_target_modules(module_name, None)

    def test_suffix_in_target(self):
        """A module whose last path component is listed is accepted."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert is_in_target_modules(module_name, ["o_proj", "q_proj"])

    def test_suffix_not_in_target(self):
        """A module whose last path component is not listed is rejected."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert not is_in_target_modules(module_name, ["q_proj", "k_proj"])

    def test_empty_target_modules(self):
        """An empty target list rejects the module."""
        module_name = "model.layers.0.self_attn.o_proj"
        assert not is_in_target_modules(module_name, [])

    def test_exact_name_match(self):
        """A bare module name present in the list is accepted."""
        targets = ["dense1", "dense2"]
        assert is_in_target_modules("dense1", targets)

    def test_exact_name_no_match(self):
        """A bare module name absent from the list is rejected."""
        targets = ["dense1", "dense2"]
        assert not is_in_target_modules("dense3", targets)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import AutoTokenizer
import vllm
import vllm.config
from vllm.lora.request import LoRARequest
from ..utils import create_new_process_for_each_test, multi_gpu_test
# Base model passed to vllm.LLM and to the tokenizer loader below.
MODEL_PATH = "Qwen/Qwen3.5-4B"
# User-message template; ``{query}`` is filled in by do_sample.
PROMPT_TEMPLATE = """Write a SQL query for the given database.\nSchema:\nTables:\n - stadium(Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average)\n - singer(Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male)\n - concert(concert_ID, concert_Name, Theme, Stadium_ID, Year)\n - singer_in_concert(concert_ID, Singer_ID)\n\nQuestion:\n{query}""" # noqa: E501
# Expected generations, compared index by index with do_sample's output.
EXPECTED_LORA_OUTPUT = [
    "SELECT count(*) FROM singer",
    "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
    "SELECT name FROM stadium WHERE stadium_id NOT IN (SELECT stadium_id FROM concert)",
]
# NOTE: loaded at import time; do_sample uses it to apply the chat template.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Generate SQL for three fixed questions and return the stripped texts.

    Args:
        llm: Engine used for generation.
        lora_path: Path handed to ``LoRARequest`` when ``lora_id`` is truthy.
        lora_id: LoRA id; a falsy value (e.g. 0) generates without any
            LoRA request.

    Returns:
        One stripped generated text per prompt, in the order of
        ``llm.generate``'s outputs.
    """
    questions = (
        "How many singers do we have?",
        "What is the average, minimum, and maximum age of all singers from France?",
        "What are the names of the stadiums without any concerts?",
    )
    # Wrap each question in the SQL prompt, then in the model's chat template.
    chat_prompts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": PROMPT_TEMPLATE.format(query=question)}],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # disable thinking
        )
        for question in questions
    ]

    # temperature=0 is used so the output can be compared to fixed strings.
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=512)
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    else:
        lora_request = None

    outputs = llm.generate(chat_prompts, sampling_params, lora_request=lora_request)

    generated_texts: list[str] = []
    for request_output in outputs:
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
@create_new_process_for_each_test()
def test_qwen35_dense_model_lora(qwen35_dense_model_lora_files):
    """Generate with LoRA ids 1 and 2 and compare to the expected SQL."""
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=512,
        enable_lora=True,
        max_loras=2,
        max_num_seqs=16,
        max_lora_rank=8,
        trust_remote_code=True,
    )

    # The same adapter files are registered under two different LoRA ids.
    for lora_id in (1, 2):
        generated = do_sample(llm, qwen35_dense_model_lora_files, lora_id=lora_id)
        for index, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[index] == expected
@multi_gpu_test(num_gpus=4)
def test_qwen35_dense_model_lora_tp4(qwen35_dense_model_lora_files):
    """Same check as the single-GPU test, with tensor_parallel_size=4 and
    fully_sharded_loras=False."""
    # Avoid OOM
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=False,
    )
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=8,
        max_num_seqs=16,
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=False,
        compilation_config=compilation_config,
    )

    first_run = do_sample(llm, qwen35_dense_model_lora_files, lora_id=1)
    print(first_run)
    for index, expected in enumerate(EXPECTED_LORA_OUTPUT):
        assert first_run[index] == expected

    second_run = do_sample(llm, qwen35_dense_model_lora_files, lora_id=2)
    for index, expected in enumerate(EXPECTED_LORA_OUTPUT):
        assert second_run[index] == expected
@multi_gpu_test(num_gpus=4)
def test_qwen35_dense_model_lora_tp4_fully_sharded_loras(qwen35_dense_model_lora_files):
    """Same check as the single-GPU test, with tensor_parallel_size=4 and
    fully_sharded_loras=True."""
    # Avoid OOM
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=False,
    )
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=512,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=8,
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=True,
        gpu_memory_utilization=0.8,
        compilation_config=compilation_config,
    )

    # The same adapter files are registered under two different LoRA ids.
    for lora_id in (1, 2):
        generated = do_sample(llm, qwen35_dense_model_lora_files, lora_id=lora_id)
        for index, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[index] == expected
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock
import pytest
import torch
from vllm.platforms import current_platform
# Skip the whole module on CUDA. This runs before the ``utils`` import that
# follows it.
if current_platform.is_cuda():
    pytest.skip(
        "ROCm skinny GEMM tests are not supported on CUDA.",
        allow_module_level=True,
    )
from vllm.model_executor.layers import utils
def test_rocm_unquantized_gemm_gfx1x_wvsplitk_path(monkeypatch):
    """With gfx1x forced on and a one-row input, wvSplitK is called exactly
    once and LLMM1 is not called."""

    def fake_wvsplitk(w, x_view, _, __):
        return x_view @ w.t()

    def fake_llmm1(w, x_view, _):
        return x_view @ w.t()

    activations = torch.randn(1, 64, dtype=torch.float16)
    weight = torch.randn(128, 64, dtype=torch.float16)

    # Disable the aiter Triton GEMM and enable skinny GEMM.
    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
    # Pretend to run on gfx1x hardware only.
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True)
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False)
    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)

    # Replace the kernels with mocks that perform a plain matmul.
    wvsplitk_mock = MagicMock(side_effect=fake_wvsplitk)
    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
    llmm1_mock = MagicMock(side_effect=fake_llmm1)
    monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock)

    out = utils.rocm_unquantized_gemm_impl(activations, weight, None)
    ref = torch.nn.functional.linear(activations, weight, None)

    wvsplitk_mock.assert_called_once()
    llmm1_mock.assert_not_called()
    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
def test_rocm_unquantized_gemm_gfx1x_n_gt_4_falls_back(monkeypatch):
    """With gfx1x forced on and a five-row input, neither wvSplitK nor LLMM1
    is called, and the result still matches the reference linear."""

    def fake_wvsplitk(w, x_view, _, __):
        return x_view @ w.t()

    def fake_llmm1(w, x_view, _):
        return x_view @ w.t()

    activations = torch.randn(5, 64, dtype=torch.float16)
    weight = torch.randn(128, 64, dtype=torch.float16)

    # Disable the aiter Triton GEMM and enable skinny GEMM.
    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
    # Pretend to run on gfx1x hardware only.
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True)
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False)
    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)

    # Replace the kernels with mocks so any call to them is recorded.
    wvsplitk_mock = MagicMock(side_effect=fake_wvsplitk)
    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
    llmm1_mock = MagicMock(side_effect=fake_llmm1)
    monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock)

    out = utils.rocm_unquantized_gemm_impl(activations, weight, None)
    ref = torch.nn.functional.linear(activations, weight, None)

    wvsplitk_mock.assert_not_called()
    llmm1_mock.assert_not_called()
    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
def test_rocm_unquantized_gemm_gfx950_wvsplitkrc_path(monkeypatch):
    """With gfx950 forced on and a 16x1024 input, wvSplitKrc is called
    exactly once and wvSplitK is not called."""

    def fake_wvsplitkrc(w, x_view, _, __):
        return x_view @ w.t()

    def fake_wvsplitk(w, x_view, _, __):
        return x_view @ w.t()

    activations = torch.randn(16, 1024, dtype=torch.float16)
    weight = torch.randn(256, 1024, dtype=torch.float16)

    # Disable the aiter Triton GEMM and enable skinny GEMM.
    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
    # Pretend to run on gfx950 hardware only.
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: False)
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: True)
    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)

    # Replace the kernels with mocks that perform a plain matmul.
    wvsplitkrc_mock = MagicMock(side_effect=fake_wvsplitkrc)
    monkeypatch.setattr(utils.ops, "wvSplitKrc", wvsplitkrc_mock)
    wvsplitk_mock = MagicMock(side_effect=fake_wvsplitk)
    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)

    out = utils.rocm_unquantized_gemm_impl(activations, weight, None)
    ref = torch.nn.functional.linear(activations, weight, None)

    wvsplitkrc_mock.assert_called_once()
    wvsplitk_mock.assert_not_called()
    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
...@@ -59,6 +59,22 @@ COLBERT_MODELS = { ...@@ -59,6 +59,22 @@ COLBERT_MODELS = {
"model_cls": "AutoModel", "model_cls": "AutoModel",
}, },
}, },
"lfm2": {
"model": "LiquidAI/LFM2-ColBERT-350M",
"colbert_dim": 128,
"max_model_len": 511,
"extra_kwargs": {
"hf_overrides": {
"architectures": ["ColBERTLfm2Model"],
},
},
"hf_comparison": {
"weights_file": "1_Dense/model.safetensors",
"weights_key": "linear.weight",
"trust_remote_code": False,
"model_cls": "AutoModel",
},
},
} }
......
...@@ -220,7 +220,10 @@ VLM_TEST_SETTINGS = { ...@@ -220,7 +220,10 @@ VLM_TEST_SETTINGS = {
vllm_runner_kwargs={ vllm_runner_kwargs={
"model_impl": "transformers", "model_impl": "transformers",
}, },
marks=[pytest.mark.core_model], marks=[
pytest.mark.core_model,
*([large_gpu_mark(min_gb=80)] if current_platform.is_rocm() else []),
],
), ),
"idefics3-transformers": VLMTestInfo( "idefics3-transformers": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"], models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
......
...@@ -39,7 +39,11 @@ models = [MODEL_NAME] ...@@ -39,7 +39,11 @@ models = [MODEL_NAME]
def granite_speech_attention_config(): def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm.""" """Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm(): if current_platform.is_rocm():
return {"backend": "ROCM_AITER_FA"} from vllm.platforms.rocm import on_mi3xx
if on_mi3xx():
return {"backend": "ROCM_AITER_FA"}
return {"backend": "TRITON_ATTN"}
return None return None
......
...@@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple): ...@@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple):
sampling_params: SamplingParams | None = None sampling_params: SamplingParams | None = None
@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION]) @pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl( def test_keye_vl(image_assets, question: str):
image_assets,
question: str,
):
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
image_urls = [encode_image_url(image) for image in images] image_urls = [encode_image_url(image) for image in images]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Sequence
from collections.abc import Sequence
import pytest import pytest
import regex as re
from transformers import AutoModel from transformers import AutoModel
from tests.models.utils import check_logprobs_close from tests.models.utils import check_logprobs_close
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.logprobs import Logprob, SampleLogprobs
from vllm.tokenizers import TokenizerLike
from ....conftest import HfRunner, PromptImageInput, VllmRunner from ....conftest import HfRunner, PromptImageInput, VllmRunner
from ....utils import create_new_process_for_each_test
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB") IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>" PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
class DummyLogprobs(dict[int, Logprob]):
    """Mapping that assigns one shared ``Logprob(0.0)`` to every given id."""

    def __init__(self, vocab_ids: Iterable[int]):
        # A single Logprob instance is shared by all keys.
        placeholder = Logprob(0.0)
        super().__init__((vocab_id, placeholder) for vocab_id in vocab_ids)

    def __repr__(self):
        # Fixed short form instead of printing every entry.
        return "DummyLogprobs()"
def mask_bbox_tokens(
    output: tuple[list[int], str, SampleLogprobs],
    tokenizer: TokenizerLike,
) -> tuple[list[int], str, SampleLogprobs]:
    """
    Replace the logprobs of bounding box tokens with a DummyLogprobs
    covering the whole vocabulary, so that check_logprobs_close always
    passes for those tokens; it is reasonable for them to differ slightly.

    Token ids and the output string are returned unchanged.
    """
    bbox_token = re.compile(r"<[xy]_[\d.]+>")
    vocab = tokenizer.get_vocab()
    token_ids, text, per_token_logprobs = output

    masked_logprobs = [
        DummyLogprobs(vocab.values())
        if bbox_token.match(tokenizer.decode(token_id))
        else token_logprobs
        for token_id, token_logprobs in zip(token_ids, per_token_logprobs)
    ]
    return token_ids, text, masked_logprobs
def run_test( def run_test(
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
...@@ -44,6 +76,8 @@ def run_test( ...@@ -44,6 +76,8 @@ def run_test(
for prompts, images in inputs for prompts, images in inputs
] ]
tokenizer = vllm_model.llm.get_tokenizer()
with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
hf_outputs_per_case = [ hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit( hf_model.generate_greedy_logprobs_limit(
...@@ -58,18 +92,20 @@ def run_test( ...@@ -58,18 +92,20 @@ def run_test(
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=[
outputs_1_lst=vllm_outputs, mask_bbox_tokens(output, tokenizer) for output in hf_outputs
],
outputs_1_lst=[
mask_bbox_tokens(output, tokenizer) for output in vllm_outputs
],
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"]) @pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models( def test_models(
hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None: ) -> None:
...@@ -77,10 +113,7 @@ def test_models( ...@@ -77,10 +113,7 @@ def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
inputs=[ inputs=[
( ([PROMPT] * 10, [IMAGE] * 10),
[PROMPT] * 10,
[IMAGE] * 10,
),
], ],
model=model, model=model,
dtype=dtype, dtype=dtype,
......
...@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info( ...@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info.audio_idx_to_prompt, test_info.audio_idx_to_prompt,
test_info.prompt_formatter, test_info.prompt_formatter,
) )
resampler = AudioResampler( resampler = AudioResampler(target_sr=16000)
target_sr=16000,
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets] audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [ resampled_audios = [
( (
......
...@@ -24,6 +24,7 @@ from transformers import ( ...@@ -24,6 +24,7 @@ from transformers import (
GenerationConfig, GenerationConfig,
GenerationMixin, GenerationMixin,
) )
from transformers.masking_utils import create_causal_mask
from transformers.video_utils import VideoMetadata from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs from vllm.logprobs import SampleLogprobs
...@@ -489,13 +490,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -489,13 +490,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs): def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.h2ovl import ( from vllm.transformers_utils.processors.h2ovl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_h2ovl, image_to_pixel_values_h2ovl,
) )
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values_h2ovl( image_to_pixel_values_h2ovl(
...@@ -679,10 +681,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -679,10 +681,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
sin = sin.to(inputs_embeds.dtype) sin = sin.to(inputs_embeds.dtype)
# Prepare attention mask # Prepare attention mask
if attention_mask is not None: attention_mask = create_causal_mask(
attention_mask = self._update_causal_mask( config=self.config,
attention_mask, inputs_embeds, cache_position, past_key_values, False input_embeds=inputs_embeds,
) attention_mask=attention_mask,
past_key_values=past_key_values,
position_ids=position_ids,
cache_position=cache_position,
)
# Initialize and collect hidden states # Initialize and collect hidden states
hidden_states = inputs_embeds hidden_states = inputs_embeds
...@@ -751,16 +757,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -751,16 +757,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs): def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.skyworkr1v import ( from vllm.transformers_utils.processors.internvl import (
IMG_CONTEXT, image_to_pixel_values_internvl,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
) )
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values_skyworkr1v( image_to_pixel_values_internvl(
image, image,
input_size=self.image_size, input_size=self.image_size,
min_num=self.min_num, min_num=self.min_num,
...@@ -815,14 +822,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -815,14 +822,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos: npt.NDArray | list[npt.NDArray] = None, videos: npt.NDArray | list[npt.NDArray] = None,
**kwargs, **kwargs,
): ):
from vllm.model_executor.models.internvl import ( from vllm.transformers_utils.processors.internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_internvl, image_to_pixel_values_internvl,
video_to_pixel_values_internvl, video_to_pixel_values_internvl,
) )
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None: if images is not None:
...@@ -1260,9 +1268,9 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner": ...@@ -1260,9 +1268,9 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
generated). generated).
""" """
import base64
import io import io
import pybase64 as base64
import soundfile as sf import soundfile as sf
processor = hf_model.processor processor = hf_model.processor
......
...@@ -7,9 +7,9 @@ ColPali is a multi-vector retrieval model based on PaliGemma backbone ...@@ -7,9 +7,9 @@ ColPali is a multi-vector retrieval model based on PaliGemma backbone
It produces per-token embeddings for both text and image inputs. It produces per-token embeddings for both text and image inputs.
""" """
import base64
from io import BytesIO from io import BytesIO
import pybase64 as base64
import pytest import pytest
import torch import torch
from PIL import Image from PIL import Image
......
...@@ -7,9 +7,9 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token ...@@ -7,9 +7,9 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs. embeddings for both text and image inputs.
""" """
import base64
from io import BytesIO from io import BytesIO
import pybase64 as base64
import pytest import pytest
import torch import torch
from PIL import Image from PIL import Image
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ColQwen3.5 late interaction model for multi-modal retrieval.
ColQwen3.5 is a multi-vector retrieval model based on Qwen3.5 backbone with
ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
"""
import pytest
import torch
from ....conftest import VllmRunner
# Hugging Face checkpoints exercised by every test in this module.
MODELS = ["athrael-soju/colqwen3.5-4.5B-v3"]

# Embedding width (second dimension of the token-embedding matrix) that each
# checkpoint is expected to produce.
EMBED_DIMS = {"athrael-soju/colqwen3.5-4.5B-v3": 320}

# Text-only prompts; the tests below use the first query/document pair.
TEXT_QUERIES = [
    "What is the capital of France?",
    "Describe the contents of the document.",
]
TEXT_DOCUMENTS = [
    "The capital of France is Paris.",
    "This document contains important financial data.",
]

# dtype string handed to the vLLM runner.
DTYPE = "half"
def _run_token_embed_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check the shape and unit norm of per-token embeddings for one query.

    Embeds ``TEXT_QUERIES[0]`` through the pooling runner and asserts that
    the result is a 2D ``[num_tokens, embed_dim]`` matrix with more than one
    token, a width equal to ``EMBED_DIMS[model]``, and rows whose L2 norm is
    1 within a tolerance of 1e-2.
    """
    engine_kwargs = {
        "runner": "pooling",
        "dtype": dtype,
        "max_model_len": 4096,
        "enforce_eager": True,
    }
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        prompts = [TEXT_QUERIES[0]]
        results = vllm_model.token_embed(prompts)
        assert len(results) == 1

        token_embeddings = torch.tensor(results[0])

        # One row per token, one column per embedding dimension.
        assert token_embeddings.dim() == 2
        num_tokens, embed_dim = token_embeddings.shape
        assert embed_dim == EMBED_DIMS[model]
        assert num_tokens > 1

        # Each token vector is expected to be L2-normalized.
        row_norms = torch.norm(token_embeddings, p=2, dim=-1)
        unit_norms = torch.ones_like(row_norms)
        torch.testing.assert_close(row_norms, unit_norms, rtol=1e-2, atol=1e-2)
def _run_late_interaction_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Compare vLLM's score against a MaxSim score computed by hand.

    Token embeddings for the first query and first document are fed to
    ``compute_maxsim_score``; the value returned by ``vllm_model.score`` for
    the same pair must agree with it to within 1% relative tolerance.
    """
    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score

    query_text = TEXT_QUERIES[0]
    document_text = TEXT_DOCUMENTS[0]

    engine_kwargs = {
        "runner": "pooling",
        "dtype": dtype,
        "max_model_len": 4096,
        "enforce_eager": True,
    }
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        query_outputs = vllm_model.token_embed([query_text])
        document_outputs = vllm_model.token_embed([document_text])

        # Reference score built from the raw per-token embeddings.
        query_emb = torch.tensor(query_outputs[0])
        document_emb = torch.tensor(document_outputs[0])
        expected_score = compute_maxsim_score(query_emb, document_emb).item()

        actual_scores = vllm_model.score(query_text, document_text)
        assert len(actual_scores) == 1
        assert actual_scores[0] == pytest.approx(expected_score, rel=0.01)
def _run_relevance_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that on-topic documents outscore an off-topic one.

    Scores one machine-learning query against three documents and asserts
    that both the ML and the deep-learning documents score strictly higher
    than the weather document.
    """
    query = "What is machine learning?"
    ml_doc = "Machine learning is a subset of artificial intelligence."
    weather_doc = "The weather forecast shows rain tomorrow."
    dl_doc = "Deep learning uses neural networks for complex tasks."

    engine_kwargs = {
        "runner": "pooling",
        "dtype": dtype,
        "max_model_len": 4096,
        "enforce_eager": True,
    }
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        scores = vllm_model.score(query, [ml_doc, weather_doc, dl_doc])
        assert len(scores) == 3

        # Scores come back in the same order the documents were passed in.
        ml_score, weather_score, dl_score = scores
        assert ml_score > weather_score, (
            "ML doc should score higher than weather doc"
        )
        assert dl_score > weather_score, (
            "DL doc should score higher than weather doc"
        )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_token_embed(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
) -> None:
    """Run the token-embedding check for every model/dtype combination.

    Delegates to ``_run_token_embed_test``; ``vllm_runner`` is supplied by a
    pytest fixture.
    """
    _run_token_embed_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_late_interaction_scoring(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
) -> None:
    """Run the late-interaction scoring check for every model/dtype combination.

    Delegates to ``_run_late_interaction_test``; ``vllm_runner`` is supplied
    by a pytest fixture.
    """
    _run_late_interaction_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_relevance_ordering(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
) -> None:
    """Run the relevance-ordering check for every model/dtype combination.

    Delegates to ``_run_relevance_test``; ``vllm_runner`` is supplied by a
    pytest fixture.
    """
    _run_relevance_test(vllm_runner, model, dtype=dtype)
...@@ -9,10 +9,10 @@ Tests for the LlamaNemotronVL model family: ...@@ -9,10 +9,10 @@ Tests for the LlamaNemotronVL model family:
Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone. Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
""" """
import base64
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
import pybase64 as base64
import pytest import pytest
import torch import torch
from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor
...@@ -22,8 +22,10 @@ from vllm.entrypoints.chat_utils import ( ...@@ -22,8 +22,10 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
) )
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import ROCM_ENGINE_KWARGS
from ...utils import check_embeddings_close from ...utils import check_embeddings_close
# Prefixes used by the model API # Prefixes used by the model API
...@@ -70,6 +72,7 @@ def _run_test( ...@@ -70,6 +72,7 @@ def _run_test(
max_model_len=2048, max_model_len=2048,
enforce_eager=True, enforce_eager=True,
trust_remote_code=True, trust_remote_code=True,
**ROCM_ENGINE_KWARGS,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images) vllm_outputs = vllm_model.embed(input_texts, images=input_images)
...@@ -250,6 +253,7 @@ def _run_vllm_reranker( ...@@ -250,6 +253,7 @@ def _run_vllm_reranker(
max_model_len=2048, max_model_len=2048,
enforce_eager=True, enforce_eager=True,
trust_remote_code=True, trust_remote_code=True,
**ROCM_ENGINE_KWARGS,
) as vllm_model: ) as vllm_model:
has_images = any(img is not None for _, img in docs) has_images = any(img is not None for _, img in docs)
...@@ -322,8 +326,11 @@ def _run_reranker_test( ...@@ -322,8 +326,11 @@ def _run_reranker_test(
assert len(hf_scores) == len(vllm_scores), ( assert len(hf_scores) == len(vllm_scores), (
f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}" f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}"
) )
# NOTE: ROCm shows slightly higher numerical variance due to the different
# attention backends used by vLLM and HF; use a marginally looser tolerance
rel_tol = 0.022 if current_platform.is_rocm() else 0.02
for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)): for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)):
assert hf_score == pytest.approx(vllm_score, rel=0.02), ( assert hf_score == pytest.approx(vllm_score, rel=rel_tol), (
f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}" f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
) )
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import pytest import pytest
import torch.nn.functional as F import torch.nn.functional as F
import transformers.utils
from PIL import Image from PIL import Image
from vllm.assets.base import get_vllm_public_assets from vllm.assets.base import get_vllm_public_assets
...@@ -12,6 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner ...@@ -12,6 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ...utils import check_embeddings_close from ...utils import check_embeddings_close
# Backward compatibility for a function that was removed in Transformers v5.
# Only needed for generating the HF reference.
transformers.utils.is_flash_attn_greater_or_equal_2_10 = (
lambda: transformers.utils.is_flash_attn_greater_or_equal("2.1.0")
)
HF_TEXT_PROMPTS = [ HF_TEXT_PROMPTS = [
# T -> X # T -> X
"Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501 "Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501
......
...@@ -23,7 +23,7 @@ def _get_expected_num_patches( ...@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int, min_num: int,
max_num: int, max_num: int,
): ):
from vllm.model_executor.models.h2ovl import ( from vllm.transformers_utils.processors.h2ovl import (
calculate_h2ovl_targets, calculate_h2ovl_targets,
get_h2ovl_target_ratios, get_h2ovl_target_ratios,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment