Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -7,7 +7,7 @@ import torch
 from tests.kernels.quant_utils import ref_dynamic_per_token_quant
 from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
 def test_dynamic_scaled_int8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
 def test_dynamic_scaled_int8_azp_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    int8_traits = torch.iinfo(torch.int8)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
 def test_static_scaled_int8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    int8_traits = torch.iinfo(torch.int8)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
    scale: float,
    azp: int,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    int8_traits = torch.iinfo(torch.int8)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300

--- a/tests/kernels/quantization/test_mxfp4_qutlass.py
+++ b/tests/kernels/quantization/test_mxfp4_qutlass.py
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
 from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 if not torch.cuda.is_available():
    pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
 @pytest.fixture(autouse=True)
 def _seed_each_test():
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    np.random.seed(0)
    torch.random.manual_seed(0)

--- a/tests/kernels/quantization/test_nvfp4_qutlass.py
+++ b/tests/kernels/quantization/test_nvfp4_qutlass.py
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
 from vllm._custom_ops import fusedQuantizeNv
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 if not torch.cuda.is_available():
    pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
 @pytest.fixture(autouse=True)
 def _seed_each_test():
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    np.random.seed(0)
    torch.random.manual_seed(0)

--- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
 from vllm._custom_ops import scaled_fp4_quant
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("shape", SHAPES)
 @torch.inference_mode()
 def test_silu_mul_nvfp4_quant(
+    default_vllm_config,
    dtype: torch.dtype,
    shape: tuple[int, int],
 ) -> None:
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = "cuda:0"
    torch.set_default_device(device)

--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -11,7 +11,9 @@ import pytest
 import torch
 from vllm.platforms import current_platform
 from ...utils import models_path_prefix
+from vllm.utils.torch_utils import set_random_seed
 device = "cuda"
@@ -86,7 +88,7 @@ def test_scaled_mm(
 ):
    is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    # NOTE: There are cases, where if the matrix is large enough, an output
    # like 65504.4 can be produced, and can easily turn into inf when

--- a/tests/kernels/quantization/untest_block_fp8.py
+++ b/tests/kernels/quantization/untest_block_fp8.py
@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
    per_block_cast_to_fp8,
    should_use_deepgemm_for_fp8_linear,
 )
+from vllm.utils.flashinfer import (
+    flashinfer_fp8_blockscale_gemm,
+    has_flashinfer_fp8_blockscale_gemm,
+)
 from vllm.utils.import_utils import has_deep_gemm
 if current_platform.get_device_capability() < (9, 0):
@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
    assert rel_diff < 0.001
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="This platform supports e4m3fnuz, not e4m3fn.",
+)
+@pytest.mark.parametrize(
+    "M,N,K,block_size,out_dtype,seed",
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
+)
+@torch.inference_mode()
+def test_w8a8_block_fp8_flashinfer_matmul(M, N, K, block_size, out_dtype, seed):
+    if not has_flashinfer_fp8_blockscale_gemm():
+        pytest.skip(
+            "FlashInfer block GEMM not available (requires SM90+ and FlashInfer)"
+        )
+    # only aligned sizes
+    if K % 128 != 0 or N % 64 != 0:
+        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
+    torch.manual_seed(seed)
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max = fp8_info.max
+    A_bf16 = (torch.rand(M, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    B_bf16 = (torch.rand(N, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    A_fp8, As_fp8 = per_token_group_quant_fp8(A_bf16, block_size[1], use_ue8m0=False)
+    B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_bf16, block_size, use_ue8m0=False)
+    As = As_fp8.to(torch.float32)
+    Bs = Bs_fp8.to(torch.float32)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+    out = flashinfer_fp8_blockscale_gemm(
+        input=A_bf16,
+        weight=B_fp8,
+        input_scale=None,
+        weight_scale=Bs,
+        out_dtype=out_dtype,
+    )
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.bfloat16) - ref_out.to(torch.bfloat16))
+    ) / torch.mean(torch.abs(ref_out.to(torch.bfloat16)))
+    assert rel_diff < 0.001
--- a/tests/kernels/quantization/untest_fp8_quant.py
+++ b/tests/kernels/quantization/untest_fp8_quant.py
@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
    ref_dynamic_per_token_quant,
 )
 from tests.kernels.utils import opcheck
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    scaled_quantize,
+)
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -21,10 +25,18 @@ SEEDS = [0]
 def opcheck_fp8_quant(
-    output, input, scale=None, scale_ub=None, use_per_token_if_dynamic=False
+    output,
+    input,
+    scale=None,
+    scale_ub=None,
+    use_per_token_if_dynamic=False,
+    group_shape=None,
 ):
    if scale is not None:
-        opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale))
+        opcheck(
+            torch.ops._C.static_scaled_fp8_quant,
+            (output, input, scale, group_shape),
+        )
    elif use_per_token_if_dynamic:
        scale = torch.empty(
            (input.shape[0], 1), device=input.device, dtype=torch.float32
@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
 def test_dynamic_per_token_fp8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    x = (
        torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
 def test_dynamic_per_tensor_fp8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
 @torch.inference_mode()
 @pytest.mark.parametrize("seed", SEEDS)
 def test_fp8_quant_large(seed: int) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
    hidden_size = 1152  # Smallest hidden_size to reproduce the error
@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
    ref_out = ref_out.to(dtype=dtype)
    ops_out = ops_out.to(dtype=dtype)
    torch.testing.assert_close(ref_out, ops_out)
\ No newline at end of file
+# Test static FP8 quantization with 2D group scales
+GROUP_SHAPES_2D = [
+    (-1, -1),  # Per-tensor
+    (-1, 1),  # Per-channel
+    (1, -1),  # Per-token
+    (-1, 128),  # Per-head quantization
+    (1, 128),  # DeepSeek-style per-token-per-group (group_m=1, group_n=128)
+    (128, 128),  # DeepSeek-style block quantization
+    (1, 64),  # Smaller group size
+    (1, 16),  # Small group (scalar path in kernel)
+    (4, 256),  # Non-trivial both dimensions
+]
+# Use sizes divisible by all group shapes
+NUM_TOKENS_GROUP = [128, 512]
+HIDDEN_SIZES_GROUP = [256, 1024, 2048]
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
+@pytest.mark.parametrize("group_shape", GROUP_SHAPES_2D)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_static_fp8_quant_group_2d(
+    num_tokens: int,
+    hidden_size: int,
+    group_shape: tuple[int, int],
+    dtype: torch.dtype,
+    seed: int,
+) -> None:
+    """Test static FP8 quantization with 2D group scales using scaled_quantize."""
+    # Normalize group_shape (-1 means full extent)
+    norm_group_m = num_tokens if group_shape[0] == -1 else group_shape[0]
+    norm_group_n = hidden_size if group_shape[1] == -1 else group_shape[1]
+    # Skip if sizes are not divisible by group shape
+    if num_tokens % norm_group_m != 0 or hidden_size % norm_group_n != 0:
+        pytest.skip(
+            f"Skipping: ({num_tokens}, {hidden_size}) not divisible by "
+            f"group_shape ({group_shape[0]}, {group_shape[1]})"
+        )
+    # Seed through the same helper every other test in this file uses.
+    set_random_seed(seed)
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    # Reference quantization; the scales it produces are fed to the custom op
+    # as static scales.
+    ref_out, scale = scaled_quantize(
+        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
+    )
+    ops_out, ops_scale = ops.scaled_fp8_quant(x, scale=scale, group_shape=group_shape)
+    # The scales returned by the op are expected to match the ones passed in.
+    torch.testing.assert_close(scale, ops_scale)
+    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)
+    opcheck_fp8_quant(ops_out, x, scale=scale)
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("group_shape", [(1, -1), (-1, 1)])  # per-token, per-channel
+@torch.inference_mode()
+def test_static_fp8_quant_1d_scale(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    group_shape: tuple[int, int],
+) -> None:
+    """Test static FP8 quantization with 1D scale (per-token or per-channel)."""
+    # Seed through the same helper every other test in this file uses.
+    set_random_seed(seed)
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    # Reference quantization; its scales come back in 2D form.
+    ref_out, scale_2d = scaled_quantize(
+        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
+    )
+    # Flatten scale to 1D for testing 1D scale path
+    scale_1d = scale_2d.flatten()
+    ops_out, ops_scale = ops.scaled_fp8_quant(
+        x, scale=scale_1d, group_shape=group_shape
+    )
+    # The scales returned by the op are expected to match the 1D ones passed in.
+    torch.testing.assert_close(scale_1d, ops_scale)
+    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)
+    opcheck_fp8_quant(ops_out, x, scale=scale_1d, group_shape=group_shape)
--- a/tests/kernels/quantization/untest_nvfp4_quant.py
+++ b/tests/kernels/quantization/untest_nvfp4_quant.py
@@ -6,6 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
+from vllm.utils.torch_utils import set_random_seed
 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
    seed: int,
    device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    torch.set_default_device(device)
    m, n = shape
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
 @torch.inference_mode()
 def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
    dtype = torch.float16
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    torch.set_default_device("cuda:0")
    m, n = pad_shape

--- a/tests/kernels/quantization/untest_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/untest_nvfp4_scaled_mm.py
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
    seed: int,
    device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    m, n, packed_k = shape
    k = packed_k * 2
    block_size = 16

--- a/tests/kernels/test_apply_repetition_penalties.py
+++ b/tests/kernels/test_apply_repetition_penalties.py
@@ -9,6 +9,7 @@ from vllm._custom_ops import (
    apply_repetition_penalties_torch,
 )
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
 # [stress, stress, stress, Qwen, llama 4]
@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
    Test the apply_repetition_penalties custom op
    against a reference implementation.
    """
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    torch.set_default_device("cuda:0")
    # Create test data
@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
    dtype = torch.float32
    seed = 0
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    torch.set_default_device("cuda:0")
    # Create test data

--- a/tests/kernels/test_fla_layernorm_guard.py
+++ b/tests/kernels/test_fla_layernorm_guard.py
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
    layernorm_fn,
    rms_norm_ref,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 def layer_norm_ref(
@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
    is_rms_norm: bool,
 ) -> None:
    """Test basic layer norm forward pass without z (gate) tensor."""
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    device = torch.device("cuda:0")
    # Create inputs
@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
    is_rms_norm: bool,
 ) -> None:
    """Test layer norm forward pass with z (gate) tensor."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = torch.device("cuda:0")
    # Create inputs
@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
            f"hidden_size {hidden_size} not divisible by group_size {group_size}"
        )
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = torch.device("cuda:0")
    # Create inputs
@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
    dtype: torch.dtype,
 ) -> None:
    """Test that rows_per_block logic works correctly for various M sizes."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = torch.device("cuda:0")
    hidden_size = 1024
@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
 def test_strided_input(dtype: torch.dtype) -> None:
    """Test that the kernel handles non-contiguous (strided)
    inputs correctly."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = torch.device("cuda:0")
    num_tokens = 128
    hidden_size = 1024
@@ -318,7 +318,7 @@ def test_output_buffer_provided(
    dtype: torch.dtype,
 ) -> None:
    """Test that the kernel works when an output buffer is provided."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = torch.device("cuda:0")
    # Create inputs
@@ -359,7 +359,7 @@ def test_multidimensional_input(
    dtype: torch.dtype,
 ) -> None:
    """Test that the autograd function handles multidimensional inputs."""
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = torch.device("cuda:0")
    hidden_size = shape[-1]

--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -42,7 +42,7 @@ def set_seed(seed):
    not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
    reason="CUDA not available or PyTorch version < 2.7",
 )
-def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
+def test_flex_attention_vs_default_backend(vllm_runner):
    """Test that FlexAttention produces the same outputs as the default backend.
    This test compares the outputs from the FlexAttention backend with
@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    ]
    # Run with flex attention
-    with monkeypatch.context() as m:
+    set_seed(seed)
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+    with vllm_runner(
+        model_name,
-        set_seed(seed)
+        runner="generate",
-        with vllm_runner(
+        tensor_parallel_size=1,
-            model_name,
+        num_gpu_blocks_override=128,
-            runner="generate",
+        enforce_eager=True,
-            tensor_parallel_size=1,
+        attention_config={"backend": "FLEX_ATTENTION"},
-            num_gpu_blocks_override=128,
+    ) as llm_flex:
-            enforce_eager=True,
+        output_flex = llm_flex.generate_greedy_logprobs(
-        ) as llm_flex:
+            prompts, max_tokens, num_logprobs
-            output_flex = llm_flex.generate_greedy_logprobs(
+        )
-                prompts, max_tokens, num_logprobs
-            )
    # Run with default backend
-    with monkeypatch.context() as m:
+    set_seed(seed)
-        set_seed(seed)
+    with vllm_runner(
-        with vllm_runner(
+        model_name,
-            model_name,
+        runner="generate",
-            runner="generate",
+        tensor_parallel_size=1,
-            tensor_parallel_size=1,
+        num_gpu_blocks_override=128,
-            num_gpu_blocks_override=128,
+        enforce_eager=True,
-            enforce_eager=True,
+        gpu_memory_utilization=0.85,
-            gpu_memory_utilization=0.85,
+    ) as llm_default:
-        ) as llm_default:
+        output_default = llm_default.generate_greedy_logprobs(
-            output_default = llm_default.generate_greedy_logprobs(
+            prompts, max_tokens, num_logprobs
-                prompts, max_tokens, num_logprobs
+        )
-            )
    check_logprobs_close(
        outputs_0_lst=output_flex,
@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
    reason="CUDA not available or PyTorch version < 2.7",
 )
-def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
+def test_encoder_flex_attention_vs_default_backend(vllm_runner):
    """Test that FlexAttention produces the same outputs as the default backend.
    This test compares the outputs from the FlexAttention backend with
@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    ]
    # Run with flex attention
-    with monkeypatch.context() as m:
+    with vllm_runner(
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        model_name,
-        with vllm_runner(
+        runner="pooling",
-            model_name,
+        dtype=torch.bfloat16,
-            runner="pooling",
+        tensor_parallel_size=1,
-            dtype=torch.bfloat16,
+        max_model_len=100,
-            tensor_parallel_size=1,
+        enforce_eager=True,
-            max_model_len=100,
+        attention_config={"backend": "FLEX_ATTENTION"},
-            enforce_eager=True,
+    ) as llm_flex:
-        ) as llm_flex:
+        flex_outputs = llm_flex.embed(prompts)
-            flex_outputs = llm_flex.embed(prompts)
    # Run with default backend
-    with (
+    with vllm_runner(
-        monkeypatch.context() as m,
+        model_name,
-        vllm_runner(
+        runner="pooling",
-            model_name,
+        dtype=torch.bfloat16,
-            runner="pooling",
+        tensor_parallel_size=1,
-            dtype=torch.bfloat16,
+        max_model_len=100,
-            tensor_parallel_size=1,
+        enforce_eager=True,
-            max_model_len=100,
+    ) as llm_default:
-            enforce_eager=True,
-        ) as llm_default,
-    ):
        default_outputs = llm_default.embed(prompts)
    check_embeddings_close(

--- a/tests/kernels/untest_fused_quant_activation.py
+++ b/tests/kernels/untest_fused_quant_activation.py
@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_silu_and_mul(
+    default_vllm_config,
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,

--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,11 +13,11 @@ import torch
 from torch._prims_common import TensorLikeType
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention.backends.abstract import AttentionType
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils.torch_utils import make_tensor_with_pad
+from vllm.v1.attention.backend import AttentionType
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
 @pytest.fixture
-def dummy_model() -> nn.Module:
+def dummy_model(default_vllm_config) -> nn.Module:
    model = DummyLoRAModel(
        OrderedDict(
            [
@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
 @pytest.fixture
-def dummy_model_gate_up() -> nn.Module:
+def dummy_model_gate_up(default_vllm_config) -> nn.Module:
    model = DummyLoRAModel(
        OrderedDict(
            [
@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
    return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
+@pytest.fixture(scope="session")
+def qwen2vl_language_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-language")
+@pytest.fixture(scope="session")
+def qwen2vl_vision_tower_connector_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower-connector")
+@pytest.fixture(scope="session")
+def qwen2vl_vision_tower_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower")
+@pytest.fixture(scope="session")
+def qwen25vl_vision_lora_files():
+    return snapshot_download(repo_id="EpochEcho/qwen2.5-3b-vl-lora-vision-connector")
+@pytest.fixture(scope="session")
+def qwen3vl_vision_lora_files():
+    return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
    # return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")

--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
    get_tensor_model_parallel_world_size,
 )
 from vllm.lora.ops.triton_ops import fused_moe_lora
-from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
+from vllm.utils.torch_utils import set_random_seed
 @pytest.fixture(autouse=True)
@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
    seed,
 ):
    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    # the number of randomly generated sentences.
    num_sequences = 10
    # generate data
@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
    seed,
    column_parallel,
 ):
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    # the number of randomly generated sentences.
    num_sequences = 10
    # generate data
@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
    def _get_shard_slice(shard_size):
        return slice(local_rank * shard_size, (local_rank + 1) * shard_size)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)

--- a/tests/lora/test_gptoss_tp.py
+++ b/tests/lora/test_gptoss_tp.py
@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
 ###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""  # noqa: E501
 EXPECTED_LORA_OUTPUT = [
-    "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
+    "SELECT avg(Working_Horses) FROM farm WHERE Total_Horses  >  5000",
-    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
+    "SELECT max(Cows) ,  min(Cows) FROM farm",
-    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
+    "SELECT max(Cows) ,  min(Cows) FROM farm",
 ]
@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
        assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
-def test_gpt_oss_lora(gptoss20b_lora_files):
+@pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
-    llm = vllm.LLM(
+def test_gpt_oss_lora(
-        MODEL_PATH,
+    monkeypatch: pytest.MonkeyPatch, gptoss20b_lora_files, mxfp4_use_marlin
-        max_model_len=1024,
+):
-        enable_lora=True,
+    with monkeypatch.context() as m:
-        max_loras=4,
+        m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
-        max_lora_rank=8,
+        llm = vllm.LLM(
-        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            MODEL_PATH,
-            cudagraph_specialize_lora=False,
+            max_model_len=1024,
-        ),
+            enable_lora=True,
-    )
+            max_loras=4,
+            max_lora_rank=8,
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+            max_num_seqs=2,
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
+            max_num_batched_tokens=2048,
+            compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+                cudagraph_specialize_lora=False,
+            ),
+        )
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("fully_sharded_loras", [False, True])
-def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
+@pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
-    llm = vllm.LLM(
+def test_gpt_oss_lora_tp2(
-        MODEL_PATH,
+    monkeypatch: pytest.MonkeyPatch,
-        max_model_len=1024,
+    gptoss20b_lora_files,
-        enable_lora=True,
+    fully_sharded_loras,
-        max_loras=2,
+    mxfp4_use_marlin,
-        max_lora_rank=8,
+):
-        max_num_seqs=16,
+    with monkeypatch.context() as m:
-        tensor_parallel_size=2,
+        m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
-        fully_sharded_loras=fully_sharded_loras,
+        llm = vllm.LLM(
-        compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+            MODEL_PATH,
-            cudagraph_specialize_lora=False,
+            max_model_len=1024,
-        ),
+            enable_lora=True,
-    )
+            max_loras=2,
+            max_num_seqs=2,
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+            max_num_batched_tokens=2048,
-    generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
+            tensor_parallel_size=2,
+            gpu_memory_utilization=0.8,
+            fully_sharded_loras=fully_sharded_loras,
+            compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
+                cudagraph_specialize_lora=False,
+            ),
+        )
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
+        generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding,
    get_masked_input_and_mask,
 )
-from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 from .utils import DummyLoRAManager
@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
-def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
+def test_embeddings(
+    default_vllm_config, dist_init, num_loras, device, vocab_size, stage
+) -> None:
    # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
    # device, see: https://github.com/triton-lang/triton/issues/2925
    # Same below.
@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
    torch.set_default_device(device)
    max_loras = 8
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(
        max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
    )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
    def create_random_embedding_layer():
        embedding = VocabParallelEmbedding(vocab_size, 256)
@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
 def test_lm_head_logits_processor(
-    dist_init, num_loras, device, vocab_size, stage
+    default_vllm_config, dist_init, num_loras, device, vocab_size, stage
 ) -> None:
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
    torch.set_default_device(device)
    max_loras = 8
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(
        max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
    )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
    def _pretest():
        linear = ParallelLMHead(
@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_linear_replicated(
+    default_vllm_config,
    dist_init,
    num_loras,
    device,
@@ -480,13 +483,13 @@ def test_linear_replicated(
    max_loras = 8
    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(
        max_loras=max_loras,
        max_lora_rank=8,
        lora_dtype=torch.float16,
    )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
    def create_random_linear_replicated_layer():
        linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
@@ -580,21 +583,21 @@ def test_linear_replicated(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_linear_parallel(
-    dist_init, num_loras, orientation, fully_shard, device, stage
+    default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
 ) -> None:
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
    max_loras = 8
    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(
        max_loras=max_loras,
        max_lora_rank=8,
        fully_sharded_loras=fully_shard,
        lora_dtype=torch.float16,
    )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
    def create_random_linear_parallel_layer():
        if orientation == "row":
@@ -705,21 +708,21 @@ def test_linear_parallel(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_column_parallel_packed(
-    dist_init, num_loras, repeats, fully_shard, device, stage
+    default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
 ) -> None:
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
    max_loras = 8
    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(
        max_loras=max_loras,
        max_lora_rank=8,
        fully_sharded_loras=fully_shard,
        lora_dtype=torch.float16,
    )
+    punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
+    assert check_punica_wrapper(punica_wrapper)
    def create_column_parallel_packed_layer():
        if repeats == 2:
@@ -851,7 +854,7 @@ def test_column_parallel_packed(
 @pytest.mark.parametrize(
    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
 )
-def test_vocab_parallel_embedding_indices(tp_size, seed):
+def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
    random.seed(seed)
    vocab_size = random.randint(4000, 64000)
    added_vocab_size = random.randint(0, 1024)

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -77,11 +77,18 @@ def do_sample(
            if lora_id
            else None,
        )
-    # Print the outputs.
+    lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
+        # The output should include the correct lora_request info.
+        if lora_request is not None:
+            assert output.lora_request.lora_name == lora_request.lora_name
+            assert output.lora_request.lora_int_id == lora_request.lora_int_id
+            assert output.lora_request.lora_path == lora_request.lora_path
+        else:
+            assert output.lora_request is None
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -18,6 +18,7 @@ from vllm.lora.layers import (
 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.model_manager import (
+    DEFAULT_LANGUAGE_WRAPPER_KEY,
    LoRAMapping,
    LoRAModelManager,
    LRUCacheLoRAModelManager,
@@ -110,7 +111,7 @@ def create_packed_lora(
    return LoRAModel(lora_id, 8, loras)
-def test_replace_submodules(dist_init, dummy_model):
+def test_replace_submodules(default_vllm_config, dist_init, dummy_model):
    model = dummy_model
    manager = LoRAModelManager(
        model,
@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
 @pytest.mark.parametrize("device", DEVICES)
-def test_lora_model_manager(dist_init, dummy_model, device):
+def test_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
    model = dummy_model
    model_lora1 = create_lora(
        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
    assert manager.activate_adapter(2)
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 2
    assert manager.device == device
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
    assert hasattr(manager, "supported_lora_modules")
    assert sorted(manager.supported_lora_modules) == [
        "dense1",
@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
 @pytest.mark.parametrize("device", DEVICES)
-def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
+def test_lora_lru_cache_model_manager(
+    default_vllm_config, dist_init, dummy_model, device
+):
    model = dummy_model
    model_lora1 = create_lora(
        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
    assert manager.remove_adapter(3)
    with pytest.raises(ValueError):
        assert manager.pin_adapter(3)
+    assert (
-    assert manager.punica_wrapper.device == device
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
    assert manager.device == device
 @pytest.mark.parametrize("device", DEVICES)
-def test_lru_lora_model_manager(dist_init, dummy_model, device):
+def test_lru_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
    # This tests just the LRU cache functionality, everything else is
    # tested in test_lora_model_manager
    model = dummy_model
@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
        assert manager.remove_oldest_adapter()
    assert set(manager.list_adapters()) == {1}
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
    assert manager.device == device
 @pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
+def test_lru_cache_worker_adapter_manager(
+    default_vllm_config, dist_init, dummy_model, device, tmp_path
+):
    lora_config = LoRAConfig(
        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
    )
@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
        )
    assert worker_adapter_manager.device == device
-    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
+    punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
+        DEFAULT_LANGUAGE_WRAPPER_KEY
+    )
+    assert punica_wrapper.device == device
 @pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
+def test_worker_adapter_manager(
+    default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
+):
    # Should remove every LoRA not specified in the request.
    lora_config = LoRAConfig(
        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
        )
    assert worker_adapter_manager.device == device
-    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
+    punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
+        DEFAULT_LANGUAGE_WRAPPER_KEY
+    )
+    assert punica_wrapper.device == device
 @pytest.mark.parametrize("device", DEVICES)
-def test_packed_loras(dist_init, dummy_model_gate_up, device):
+def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, device):
    model = dummy_model_gate_up
    model_lora = create_packed_lora(
        1,