Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/tests/kernels/quantization/test_mxfp4_triton_ep.py
+++ b/tests/kernels/quantization/test_mxfp4_triton_ep.py
@@ -17,89 +17,6 @@ from unittest.mock import MagicMock, patch
 import pytest
 import torch
-from vllm.model_executor.layers.quantization.mxfp4 import (
-    Mxfp4Backend,
-    Mxfp4MoEMethod,
-)
-def _make_mock_moe_config(ep_size: int = 1) -> MagicMock:
-    """Create a mock FusedMoEConfig with the given EP size."""
-    parallel_config = MagicMock()
-    parallel_config.ep_size = ep_size
-    moe_config = MagicMock()
-    moe_config.ep_size = ep_size
-    moe_config.is_lora_enabled = False
-    moe_config.moe_parallel_config = parallel_config
-    return moe_config
-class TestMxfp4TritonIsMonolithic:
-    """Verify that is_monolithic is always True for the TRITON backend,
-    regardless of EP size, since triton_kernel_moe_forward now handles
-    expert_map remapping internally."""
-    @pytest.mark.parametrize(
-        "backend,ep_size,expected_monolithic",
-        [
-            # TRITON is always monolithic (handles EP via expert_map remapping)
-            (Mxfp4Backend.TRITON, 1, True),
-            (Mxfp4Backend.TRITON, 2, True),
-            (Mxfp4Backend.TRITON, 4, True),
-            # SM100 backends are always monolithic
-            (Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, 1, True),
-            (Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, 2, True),
-            (Mxfp4Backend.SM100_FI_MXFP4_BF16, 1, True),
-            (Mxfp4Backend.SM100_FI_MXFP4_BF16, 2, True),
-            # MARLIN is never monolithic
-            (Mxfp4Backend.MARLIN, 1, False),
-            (Mxfp4Backend.MARLIN, 2, False),
-        ],
-        ids=[
-            "triton-no-ep",
-            "triton-ep2",
-            "triton-ep4",
-            "sm100-trtllm-no-ep",
-            "sm100-trtllm-ep2",
-            "sm100-bf16-no-ep",
-            "sm100-bf16-ep2",
-            "marlin-no-ep",
-            "marlin-ep2",
-        ],
-    )
-    @patch(
-        "vllm.model_executor.layers.quantization.mxfp4.get_mxfp4_backend",
-    )
-    @patch(
-        "vllm.model_executor.layers.quantization.mxfp4.get_current_vllm_config",
-    )
-    def test_is_monolithic(
-        self,
-        mock_get_config,
-        mock_get_backend,
-        backend,
-        ep_size,
-        expected_monolithic,
-    ):
-        """is_monolithic should be True for TRITON regardless of EP size."""
-        mock_get_backend.return_value = backend
-        mock_compilation_config = MagicMock()
-        mock_compilation_config.max_cudagraph_capture_size = 1024
-        mock_vllm_config = MagicMock()
-        mock_vllm_config.compilation_config = mock_compilation_config
-        mock_get_config.return_value = mock_vllm_config
-        moe_config = _make_mock_moe_config(ep_size=ep_size)
-        method = Mxfp4MoEMethod(moe_config)
-        assert method.is_monolithic == expected_monolithic, (
-            f"Expected is_monolithic={expected_monolithic} for "
-            f"backend={backend.name}, ep_size={ep_size}, "
-            f"but got {method.is_monolithic}."
-        )
 class TestTritonMoeForwardExpertMap:
    """Test that triton_kernel_moe_forward applies expert_map remapping

--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode
        BIAS = torch.rand(m, dtype=dtype, device="cuda") * 2 - 1
    elif bias_mode == 2:
        BIAS = torch.rand(n, m, dtype=dtype, device="cuda") * 2 - 1
+    elif bias_mode == 3:
+        BIAS = torch.rand(1, m, dtype=dtype, device="cuda") * 2 - 1
    ref_out = torch.nn.functional.linear(A, B, BIAS)
    out = ops.wvSplitKrc(A, B, cu_count, BIAS)
@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel(
    ref_out = torch.nn.functional.linear(A, B, BIAS)
    out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
-    if xnorm:
-        assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-8)
-    else:
-        assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-2)
+    # Accumulation error in fp16 GEMM scales with sqrt(K)
+    atol = torch.finfo(dtype).eps * math.sqrt(k)
+    torch.testing.assert_close(out, ref_out, atol=atol, rtol=1e-2)
 @pytest.mark.parametrize("xnorm", [False, True])

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -294,6 +294,11 @@ def whisper_lora_files():
    return snapshot_download(repo_id="chengyili2005/whisper-small-mandarin-lora")
+@pytest.fixture(scope="session")
+def qwen35_dense_model_lora_files():
+    return snapshot_download(repo_id="jeeejeee/qwen35-4b-text-only-sql-lora")
 @pytest.fixture
 def reset_default_device():
    """

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -711,3 +711,192 @@ def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, devic
    torch.testing.assert_close(
        packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b
    )
+def _test_target_modules(
+    model,
+    target_modules: list[str] | None,
+    device: str,
+    expected_lora: list[tuple[str, type]],
+    expected_no_lora: list[tuple[str, type]],
+):
+    """Create a LoRAModelManager and assert which modules have LoRA applied."""
+    LoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8,
+            max_cpu_loras=2,
+            max_loras=2,
+            lora_dtype=DEFAULT_DTYPE,
+            target_modules=target_modules,
+        ),
+        device=device,
+    )
+    for module_path, lora_cls in expected_lora:
+        assert isinstance(model.get_submodule(module_path), lora_cls)
+    for module_path, lora_cls in expected_no_lora:
+        assert not isinstance(model.get_submodule(module_path), lora_cls)
+@pytest.mark.parametrize("device", DEVICES)
+def test_target_modules_config(default_vllm_config, dist_init, dummy_model, device):
+    """Test that target_modules config restricts which modules get LoRA applied."""
+    _test_target_modules(
+        dummy_model,
+        ["dense1"],
+        device,
+        expected_lora=[
+            ("dense1", ColumnParallelLinearWithLoRA),
+            ("layer1.dense1", ColumnParallelLinearWithLoRA),
+        ],
+        expected_no_lora=[
+            ("dense2", RowParallelLinearWithLoRA),
+            ("layer1.dense2", RowParallelLinearWithLoRA),
+        ],
+    )
+@pytest.mark.parametrize("device", DEVICES)
+def test_target_modules_multiple(default_vllm_config, dist_init, dummy_model, device):
+    """Test that multiple target_modules work correctly."""
+    _test_target_modules(
+        dummy_model,
+        ["dense1", "dense2"],
+        device,
+        expected_lora=[
+            ("dense1", ColumnParallelLinearWithLoRA),
+            ("layer1.dense1", ColumnParallelLinearWithLoRA),
+            ("dense2", RowParallelLinearWithLoRA),
+            ("layer1.dense2", RowParallelLinearWithLoRA),
+        ],
+        expected_no_lora=[],
+    )
+@pytest.mark.parametrize("device", DEVICES)
+def test_target_modules_none_uses_all(
+    default_vllm_config, dist_init, dummy_model, device
+):
+    """Test that target_modules=None uses all supported modules."""
+    _test_target_modules(
+        dummy_model,
+        None,
+        device,
+        expected_lora=[
+            ("dense1", ColumnParallelLinearWithLoRA),
+            ("layer1.dense1", ColumnParallelLinearWithLoRA),
+            ("dense2", RowParallelLinearWithLoRA),
+            ("layer1.dense2", RowParallelLinearWithLoRA),
+        ],
+        expected_no_lora=[],
+    )
+@pytest.mark.parametrize("device", DEVICES)
+def test_load_adapter_warns_on_unsupported_modules(
+    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
+):
+    """Test that _load_adapter warns when a LoRA adapter contains modules
+    not in the model's supported LoRA target modules."""
+    from unittest.mock import patch
+    import vllm.lora.worker_manager as wm_module
+    lora_config = LoRAConfig(
+        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
+    )
+    dummy_lora_files = f"{tmp_path}/lora_adapter"
+    os.makedirs(dummy_lora_files, exist_ok=True)
+    create_peft_lora(
+        dummy_model_gate_up,
+        save_dir=dummy_lora_files,
+        target_modules=["layer1.dense1", "dense2"],
+        lora_dtype=DEFAULT_DTYPE,
+    )
+    model_config = ModelConfig(max_model_len=16)
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
+    vllm_config.scheduler_config.max_num_seqs = 4
+    vllm_config.scheduler_config.max_num_batched_tokens = 2
+    worker_manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
+    worker_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
+    worker_manager.create_lora_manager(dummy_model_gate_up)
+    # Patch from_local_checkpoint to inject an unsupported module
+    original_from_checkpoint = LoRAModel.from_local_checkpoint
+    def patched_from_checkpoint(*args, **kwargs):
+        lora = original_from_checkpoint(*args, **kwargs)
+        lora.loras["unsupported_module"] = LoRALayerWeights(
+            module_name="unsupported_module",
+            rank=8,
+            lora_alpha=16,
+            lora_a=torch.randn(8, 10),
+            lora_b=torch.randn(10, 8),
+        )
+        return lora
+    lora_request = LoRARequest("test", 1, dummy_lora_files)
+    with (
+        patch.object(LoRAModel, "from_local_checkpoint", patched_from_checkpoint),
+        patch.object(wm_module.logger, "warning_once") as mock_warning,
+    ):
+        worker_manager._load_adapter(lora_request)
+        warning_args = mock_warning.call_args_list
+        found = any("unsupported_module" in str(call) for call in warning_args)
+        assert found, (
+            f"Expected warning about 'unsupported_module', got: {warning_args}"
+        )
+@pytest.mark.parametrize("device", DEVICES)
+def test_load_adapter_warns_on_target_modules_restriction(
+    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
+):
+    """Test that _load_adapter warns when a LoRA adapter contains modules
+    excluded by the deployment-time target_modules restriction."""
+    from unittest.mock import patch
+    import vllm.lora.worker_manager as wm_module
+    # Restrict to only dense2 — adapter has dense1 which will be excluded
+    lora_config = LoRAConfig(
+        max_lora_rank=8,
+        max_cpu_loras=4,
+        max_loras=4,
+        lora_dtype=DEFAULT_DTYPE,
+        target_modules=["dense2"],
+    )
+    dummy_lora_files = f"{tmp_path}/lora_adapter"
+    os.makedirs(dummy_lora_files, exist_ok=True)
+    create_peft_lora(
+        dummy_model_gate_up,
+        save_dir=dummy_lora_files,
+        target_modules=["layer1.dense1", "dense2"],
+        lora_dtype=DEFAULT_DTYPE,
+    )
+    model_config = ModelConfig(max_model_len=16)
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
+    vllm_config.scheduler_config.max_num_seqs = 4
+    vllm_config.scheduler_config.max_num_batched_tokens = 2
+    worker_manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
+    worker_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
+    worker_manager.create_lora_manager(dummy_model_gate_up)
+    lora_request = LoRARequest("test", 1, dummy_lora_files)
+    with patch.object(wm_module.logger, "warning_once") as mock_warning:
+        worker_manager._load_adapter(lora_request)
+        warning_args = mock_warning.call_args_list
+        # dense1 is supported by the model but excluded by target_modules
+        found = any("target_modules" in str(call) for call in warning_args)
+        assert found, (
+            f"Expected warning about target_modules restriction, got: {warning_args}"
+        )
--- a/tests/lora/test_lora_utils.py
+++ b/tests/lora/test_lora_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.lora.utils import is_in_target_modules, is_supported_lora_module
+class TestIsSupportedLoraModule:
+    """Tests for is_supported_lora_module (model-definition check)."""
+    def test_suffix_match(self):
+        assert is_supported_lora_module(
+            "model.layers.0.self_attn.o_proj", ["o_proj", "q_proj"]
+        )
+    def test_no_match(self):
+        assert not is_supported_lora_module(
+            "model.layers.0.self_attn.o_proj", ["q_proj", "k_proj"]
+        )
+    def test_exact_match(self):
+        assert is_supported_lora_module("o_proj", ["o_proj"])
+    def test_regex_suffix_matching(self):
+        """Regex anchors to end — partial suffix should not match."""
+        assert not is_supported_lora_module("model.layers.0.self_attn.o_proj", ["proj"])
+    def test_empty_supported_modules(self):
+        assert not is_supported_lora_module("model.layers.0.self_attn.o_proj", [])
+    def test_multiple_supported_modules(self):
+        supported = ["q_proj", "k_proj", "v_proj", "o_proj"]
+        assert is_supported_lora_module("model.layers.0.self_attn.v_proj", supported)
+        assert not is_supported_lora_module("model.layers.0.mlp.gate_proj", supported)
+class TestIsInTargetModules:
+    """Tests for is_in_target_modules (deployment-time filter)."""
+    def test_none_allows_all(self):
+        assert is_in_target_modules("model.layers.0.self_attn.o_proj", None)
+    def test_suffix_in_target(self):
+        assert is_in_target_modules(
+            "model.layers.0.self_attn.o_proj", ["o_proj", "q_proj"]
+        )
+    def test_suffix_not_in_target(self):
+        assert not is_in_target_modules(
+            "model.layers.0.self_attn.o_proj", ["q_proj", "k_proj"]
+        )
+    def test_empty_target_modules(self):
+        assert not is_in_target_modules("model.layers.0.self_attn.o_proj", [])
+    def test_exact_name_match(self):
+        assert is_in_target_modules("dense1", ["dense1", "dense2"])
+    def test_exact_name_no_match(self):
+        assert not is_in_target_modules("dense3", ["dense1", "dense2"])
--- a/tests/lora/test_qwen35_densemoel_lora.py
+++ b/tests/lora/test_qwen35_densemoel_lora.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import AutoTokenizer
+import vllm
+import vllm.config
+from vllm.lora.request import LoRARequest
+from ..utils import create_new_process_for_each_test, multi_gpu_test
+MODEL_PATH = "Qwen/Qwen3.5-4B"
+PROMPT_TEMPLATE = """Write a SQL query for the given database.\nSchema:\nTables:\n  - stadium(Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average)\n  - singer(Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male)\n  - concert(concert_ID, concert_Name, Theme, Stadium_ID, Year)\n  - singer_in_concert(concert_ID, Singer_ID)\n\nQuestion:\n{query}"""  # noqa: E501
+EXPECTED_LORA_OUTPUT = [
+    "SELECT count(*) FROM singer",
+    "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE country  =  'France'",
+    "SELECT name FROM stadium WHERE stadium_id NOT IN (SELECT stadium_id FROM concert)",
+]
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
+    prompts = [
+        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
+        PROMPT_TEMPLATE.format(
+            query=(
+                "What is the average, minimum, and maximum "
+                "age of all singers from France?"
+            )
+        ),
+        PROMPT_TEMPLATE.format(
+            query=("What are the names of the stadiums without any concerts?")
+        ),
+    ]
+    input_templates = []
+    for prmpt in prompts:
+        messages = [{"role": "user", "content": prmpt}]
+        prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False,  # disable thinking
+        )
+        input_templates.append(prompt)
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=512)
+    outputs = llm.generate(
+        input_templates,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
+    generated_texts: list[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+@create_new_process_for_each_test()
+def test_qwen35_dense_model_lora(qwen35_dense_model_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=512,
+        enable_lora=True,
+        max_loras=2,
+        max_num_seqs=16,
+        max_lora_rank=8,
+        trust_remote_code=True,
+    )
+    output1 = do_sample(llm, qwen35_dense_model_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
+    output2 = do_sample(llm, qwen35_dense_model_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output2[i] == EXPECTED_LORA_OUTPUT[i]
+@multi_gpu_test(num_gpus=4)
+def test_qwen35_dense_model_lora_tp4(qwen35_dense_model_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        max_num_seqs=16,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=False,
+        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            cudagraph_specialize_lora=False,
+        ),
+    )
+    output1 = do_sample(llm, qwen35_dense_model_lora_files, lora_id=1)
+    print(output1)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
+    output2 = do_sample(llm, qwen35_dense_model_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output2[i] == EXPECTED_LORA_OUTPUT[i]
+@multi_gpu_test(num_gpus=4)
+def test_qwen35_dense_model_lora_tp4_fully_sharded_loras(qwen35_dense_model_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=512,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=True,
+        gpu_memory_utilization=0.8,
+        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            cudagraph_specialize_lora=False,
+        ),
+    )
+    output1 = do_sample(llm, qwen35_dense_model_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
+    output2 = do_sample(llm, qwen35_dense_model_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output2[i] == EXPECTED_LORA_OUTPUT[i]
--- a/tests/model_executor/layers/test_rocm_unquantized_gemm.py
+++ b/tests/model_executor/layers/test_rocm_unquantized_gemm.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest.mock import MagicMock
+import pytest
+import torch
+from vllm.platforms import current_platform
+if current_platform.is_cuda():
+    pytest.skip(
+        "ROCm skinny GEMM tests are not supported on CUDA.",
+        allow_module_level=True,
+    )
+from vllm.model_executor.layers import utils
+def test_rocm_unquantized_gemm_gfx1x_wvsplitk_path(monkeypatch):
+    x = torch.randn(1, 64, dtype=torch.float16)
+    weight = torch.randn(128, 64, dtype=torch.float16)
+    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
+    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False)
+    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)
+    wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
+    llmm1_mock = MagicMock(side_effect=lambda w, x_view, _: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock)
+    out = utils.rocm_unquantized_gemm_impl(x, weight, None)
+    ref = torch.nn.functional.linear(x, weight, None)
+    wvsplitk_mock.assert_called_once()
+    llmm1_mock.assert_not_called()
+    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
+def test_rocm_unquantized_gemm_gfx1x_n_gt_4_falls_back(monkeypatch):
+    x = torch.randn(5, 64, dtype=torch.float16)
+    weight = torch.randn(128, 64, dtype=torch.float16)
+    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
+    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False)
+    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)
+    wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
+    llmm1_mock = MagicMock(side_effect=lambda w, x_view, _: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock)
+    out = utils.rocm_unquantized_gemm_impl(x, weight, None)
+    ref = torch.nn.functional.linear(x, weight, None)
+    wvsplitk_mock.assert_not_called()
+    llmm1_mock.assert_not_called()
+    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
+def test_rocm_unquantized_gemm_gfx950_wvsplitkrc_path(monkeypatch):
+    x = torch.randn(16, 1024, dtype=torch.float16)
+    weight = torch.randn(256, 1024, dtype=torch.float16)
+    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
+    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: True)
+    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)
+    wvsplitkrc_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitKrc", wvsplitkrc_mock)
+    wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
+    out = utils.rocm_unquantized_gemm_impl(x, weight, None)
+    ref = torch.nn.functional.linear(x, weight, None)
+    wvsplitkrc_mock.assert_called_once()
+    wvsplitk_mock.assert_not_called()
+    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
--- a/tests/models/language/pooling/test_colbert.py
+++ b/tests/models/language/pooling/test_colbert.py
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
            "model_cls": "AutoModel",
        },
    },
+    "lfm2": {
+        "model": "LiquidAI/LFM2-ColBERT-350M",
+        "colbert_dim": 128,
+        "max_model_len": 511,
+        "extra_kwargs": {
+            "hf_overrides": {
+                "architectures": ["ColBERTLfm2Model"],
+            },
+        },
+        "hf_comparison": {
+            "weights_file": "1_Dense/model.safetensors",
+            "weights_key": "linear.weight",
+            "trust_remote_code": False,
+            "model_cls": "AutoModel",
+        },
+    },
 }

--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -220,7 +220,10 @@ VLM_TEST_SETTINGS = {
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
-        marks=[pytest.mark.core_model],
+        marks=[
+            pytest.mark.core_model,
+            *([large_gpu_mark(min_gb=80)] if current_platform.is_rocm() else []),
+        ],
    ),
    "idefics3-transformers": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],

--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -39,7 +39,11 @@ models = [MODEL_NAME]
 def granite_speech_attention_config():
    """Return attention config for Granite Speech tests on ROCm."""
    if current_platform.is_rocm():
-        return {"backend": "ROCM_AITER_FA"}
+        from vllm.platforms.rocm import on_mi3xx
+        if on_mi3xx():
+            return {"backend": "ROCM_AITER_FA"}
+        return {"backend": "TRITON_ATTN"}
    return None

--- a/tests/models/multimodal/generation/test_keye.py
+++ b/tests/models/multimodal/generation/test_keye.py
@@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple):
    sampling_params: SamplingParams | None = None
-@pytest.mark.core_model
 @pytest.mark.parametrize("question", [QUESTION])
-def test_keye_vl(
-    image_assets,
-    question: str,
-):
+def test_keye_vl(image_assets, question: str):
    images = [asset.pil_image for asset in image_assets]
    image_urls = [encode_image_url(image) for image in images]

--- a/tests/models/multimodal/generation/test_nemotron_parse.py
+++ b/tests/models/multimodal/generation/test_nemotron_parse.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable, Sequence
-from collections.abc import Sequence
 import pytest
+import regex as re
 from transformers import AutoModel
 from tests.models.utils import check_logprobs_close
 from vllm.assets.image import ImageAsset
+from vllm.logprobs import Logprob, SampleLogprobs
+from vllm.tokenizers import TokenizerLike
 from ....conftest import HfRunner, PromptImageInput, VllmRunner
-from ....utils import create_new_process_for_each_test
 IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
 PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
+class DummyLogprobs(dict[int, Logprob]):
+    def __init__(self, vocab_ids: Iterable[int]):
+        super().__init__(dict.fromkeys(vocab_ids, Logprob(0.0)))
+    def __repr__(self):
+        return "DummyLogprobs()"
+def mask_bbox_tokens(
+    output: tuple[list[int], str, SampleLogprobs],
+    tokenizer: TokenizerLike,
+) -> tuple[list[int], str, SampleLogprobs]:
+    """
+    Always pass check_logprobs_close check for bounding box tokens
+    because it is reasonable for them to differ slightly.
+    """
+    ignore_pattern = r"<[xy]_[\d.]+>"
+    vocab = tokenizer.get_vocab()
+    output_ids, output_str, out_logprobs = output
+    masked_logprobs = list[dict[int, Logprob]]()
+    for token, logprobs in zip(output_ids, out_logprobs):
+        if re.match(ignore_pattern, tokenizer.decode(token)):
+            masked_logprobs.append(DummyLogprobs(vocab.values()))
+        else:
+            masked_logprobs.append(logprobs)
+    return output_ids, output_str, masked_logprobs
 def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
@@ -44,6 +76,8 @@ def run_test(
            for prompts, images in inputs
        ]
+        tokenizer = vllm_model.llm.get_tokenizer()
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
@@ -58,18 +92,20 @@ def run_test(
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
+            outputs_0_lst=[
+                mask_bbox_tokens(output, tokenizer) for output in hf_outputs
+            ],
+            outputs_1_lst=[
+                mask_bbox_tokens(output, tokenizer) for output in vllm_outputs
+            ],
            name_0="hf",
            name_1="vllm",
        )
-@pytest.mark.core_model
 @pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("num_logprobs", [5])
-@create_new_process_for_each_test("spawn")
 def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
 ) -> None:
@@ -77,10 +113,7 @@ def test_models(
        hf_runner,
        vllm_runner,
        inputs=[
-            (
-                [PROMPT] * 10,
-                [IMAGE] * 10,
-            ),
+            ([PROMPT] * 10, [IMAGE] * 10),
        ],
        model=model,
        dtype=dtype,

--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
        test_info.audio_idx_to_prompt,
        test_info.prompt_formatter,
    )
-    resampler = AudioResampler(
-        target_sr=16000,
-        method="librosa",
-    )
+    resampler = AudioResampler(target_sr=16000)
    audios = [asset.audio_and_sample_rate for asset in audio_assets]
    resampled_audios = [
        (

--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -24,6 +24,7 @@ from transformers import (
    GenerationConfig,
    GenerationMixin,
 )
+from transformers.masking_utils import create_causal_mask
 from transformers.video_utils import VideoMetadata
 from vllm.logprobs import SampleLogprobs
@@ -489,13 +490,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.image_size = self.vision_config.image_size
        def __call__(self, text: str, images: Image | list[Image], **kwargs):
-            from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT,
-                IMG_END,
-                IMG_START,
+            from vllm.transformers_utils.processors.h2ovl import (
                image_to_pixel_values_h2ovl,
            )
+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
                image_to_pixel_values_h2ovl(
@@ -679,10 +681,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        sin = sin.to(inputs_embeds.dtype)
        # Prepare attention mask
-        if attention_mask is not None:
-            attention_mask = self._update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, False
-            )
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+            cache_position=cache_position,
+        )
        # Initialize and collect hidden states
        hidden_states = inputs_embeds
@@ -751,16 +757,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.image_size = self.vision_config.image_size
        def __call__(self, text: str, images: Image | list[Image], **kwargs):
-            from vllm.model_executor.models.skyworkr1v import (
-                IMG_CONTEXT,
-                IMG_END,
-                IMG_START,
-                image_to_pixel_values_skyworkr1v,
+            from vllm.transformers_utils.processors.internvl import (
+                image_to_pixel_values_internvl,
            )
+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
-                image_to_pixel_values_skyworkr1v(
+                image_to_pixel_values_internvl(
                    image,
                    input_size=self.image_size,
                    min_num=self.min_num,
@@ -815,14 +822,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            videos: npt.NDArray | list[npt.NDArray] = None,
            **kwargs,
        ):
-            from vllm.model_executor.models.internvl import (
+            from vllm.transformers_utils.processors.internvl import (
-                IMG_CONTEXT,
-                IMG_END,
-                IMG_START,
                image_to_pixel_values_internvl,
                video_to_pixel_values_internvl,
            )
+            IMG_START = "<img>"
+            IMG_END = "</img>"
+            IMG_CONTEXT = "<IMG_CONTEXT>"
            images = [images] if isinstance(images, Image) else images
            videos = [videos] if isinstance(videos, np.ndarray) else videos
            if images is not None:
@@ -1260,9 +1268,9 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
    generated).
    """
-    import base64
    import io
+    import pybase64 as base64
    import soundfile as sf
    processor = hf_model.processor

--- a/tests/models/multimodal/pooling/test_colpali.py
+++ b/tests/models/multimodal/pooling/test_colpali.py
@@ -7,9 +7,9 @@ ColPali is a multi-vector retrieval model based on PaliGemma backbone
 It produces per-token embeddings for both text and image inputs.
 """
-import base64
 from io import BytesIO
+import pybase64 as base64
 import pytest
 import torch
 from PIL import Image

--- a/tests/models/multimodal/pooling/test_colqwen3.py
+++ b/tests/models/multimodal/pooling/test_colqwen3.py
@@ -7,9 +7,9 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
 embeddings for both text and image inputs.
 """
-import base64
 from io import BytesIO
+import pybase64 as base64
 import pytest
 import torch
 from PIL import Image

--- a/tests/models/multimodal/pooling/test_colqwen3_5.py
+++ b/tests/models/multimodal/pooling/test_colqwen3_5.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for ColQwen3.5 late interaction model for multi-modal retrieval.
+ColQwen3.5 is a multi-vector retrieval model based on Qwen3.5 backbone with
+ColBERT-style late interaction scoring (MaxSim). It produces per-token
+embeddings for both text and image inputs.
+"""
+import pytest
+import torch
+from ....conftest import VllmRunner
+MODELS = [
+    "athrael-soju/colqwen3.5-4.5B-v3",
+]
+EMBED_DIMS = {
+    "athrael-soju/colqwen3.5-4.5B-v3": 320,
+}
+TEXT_QUERIES = [
+    "What is the capital of France?",
+    "Describe the contents of the document.",
+]
+TEXT_DOCUMENTS = [
+    "The capital of France is Paris.",
+    "This document contains important financial data.",
+]
+DTYPE = "half"
+def _run_token_embed_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Verify per-token embedding shape and L2 normalization."""
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    ) as vllm_model:
+        outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
+        assert len(outputs) == 1
+        emb = torch.tensor(outputs[0])
+        # Token embeddings should be 2D: [num_tokens, embed_dim]
+        assert emb.dim() == 2
+        assert emb.shape[1] == EMBED_DIMS[model]
+        assert emb.shape[0] > 1
+        # Verify L2 normalization
+        norms = torch.norm(emb, p=2, dim=-1)
+        torch.testing.assert_close(
+            norms,
+            torch.ones_like(norms),
+            rtol=1e-2,
+            atol=1e-2,
+        )
+def _run_late_interaction_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Verify MaxSim scoring matches manual computation."""
+    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    ) as vllm_model:
+        q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
+        d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]])
+        q_emb = torch.tensor(q_outputs[0])
+        d_emb = torch.tensor(d_outputs[0])
+        manual_score = compute_maxsim_score(q_emb, d_emb).item()
+        vllm_scores = vllm_model.score(TEXT_QUERIES[0], TEXT_DOCUMENTS[0])
+        assert len(vllm_scores) == 1
+        assert vllm_scores[0] == pytest.approx(manual_score, rel=0.01)
+def _run_relevance_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Verify that relevant documents score higher than irrelevant ones."""
+    query = "What is machine learning?"
+    documents = [
+        "Machine learning is a subset of artificial intelligence.",
+        "The weather forecast shows rain tomorrow.",
+        "Deep learning uses neural networks for complex tasks.",
+    ]
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    ) as vllm_model:
+        scores = vllm_model.score(query, documents)
+        assert len(scores) == 3
+        assert scores[0] > scores[1], "ML doc should score higher than weather doc"
+        assert scores[2] > scores[1], "DL doc should score higher than weather doc"
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_5_token_embed(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    _run_token_embed_test(vllm_runner, model, dtype=dtype)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_5_late_interaction_scoring(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    _run_late_interaction_test(vllm_runner, model, dtype=dtype)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_5_relevance_ordering(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    _run_relevance_test(vllm_runner, model, dtype=dtype)
--- a/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
+++ b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
@@ -9,10 +9,10 @@ Tests for the LlamaNemotronVL model family:
 Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
 """
-import base64
 from io import BytesIO
 from pathlib import Path
+import pybase64 as base64
 import pytest
 import torch
 from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor
@@ -22,8 +22,10 @@ from vllm.entrypoints.chat_utils import (
    ChatCompletionContentPartTextParam,
 )
 from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
+from vllm.platforms import current_platform
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import ROCM_ENGINE_KWARGS
 from ...utils import check_embeddings_close
 # Prefixes used by the model API
@@ -70,6 +72,7 @@ def _run_test(
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
+        **ROCM_ENGINE_KWARGS,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
@@ -250,6 +253,7 @@ def _run_vllm_reranker(
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
+        **ROCM_ENGINE_KWARGS,
    ) as vllm_model:
        has_images = any(img is not None for _, img in docs)
@@ -322,8 +326,11 @@ def _run_reranker_test(
    assert len(hf_scores) == len(vllm_scores), (
        f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}"
    )
+    # NOTE: ROCm shows slightly higher numerical variance due to the different
+    # attention backends used by vLLM and HF; use a marginally looser tolerance
+    rel_tol = 0.022 if current_platform.is_rocm() else 0.02
    for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)):
-        assert hf_score == pytest.approx(vllm_score, rel=0.02), (
+        assert hf_score == pytest.approx(vllm_score, rel=rel_tol), (
            f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
        )

--- a/tests/models/multimodal/pooling/test_phi3v.py
+++ b/tests/models/multimodal/pooling/test_phi3v.py
@@ -3,6 +3,7 @@
 import pytest
 import torch.nn.functional as F
+import transformers.utils
 from PIL import Image
 from vllm.assets.base import get_vllm_public_assets
@@ -12,6 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
 from ...utils import check_embeddings_close
+# Backward-compatibility shim for a function that was deleted in Transformers v5.
+# Only needed for generating the HF reference.
+transformers.utils.is_flash_attn_greater_or_equal_2_10 = (
+    lambda: transformers.utils.is_flash_attn_greater_or_equal("2.1.0")
+)
 HF_TEXT_PROMPTS = [
    # T -> X
    "Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501

--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
    min_num: int,
    max_num: int,
 ):
-    from vllm.model_executor.models.h2ovl import (
+    from vllm.transformers_utils.processors.h2ovl import (
        calculate_h2ovl_targets,
        get_h2ovl_target_ratios,
    )