Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

d2b52805 · zhuwenwen · 9a521c23 · 5438967f · d2b52805 · d2b52805
Commit d2b52805 authored Sep 07, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -24,7 +24,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
    current_platform.seed_everything(seed)

    # Input tensor of shape (E, T, 2*H)
-    y = torch.randn((E, T, 2 * H), dtype=torch.float32, device="cuda")
+    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
    tokens_per_expert = torch.randint(
        low=0,
        high=T,
@@ -74,7 +74,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
        y_se = y_s[e]
        y_qe = y_q[e]

-        torch.testing.assert_close(y_se[:nt], ref_s[:nt])
+        torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
        torch.testing.assert_close(
            y_qe[:nt].to(torch.float32),
            ref_q[:nt].to(torch.float32),

--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the AWQ Triton kernel.

-Run `pytest tests/kernels/test_awq_triton.py`.
+Run `pytest tests/kernels/quantization/test_awq_triton.py`.
 """
 import pytest
 import torch

--- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for sparse cutlass kernels

-Run `pytest tests/kernels/test_semi_structured.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`.
 """

 import pytest

--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for cutlass kernels

-Run `pytest tests/kernels/test_cutlass.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`.
 """
 import random

@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,

    expert_offsets = torch.zeros((num_experts + 1),
                                 device=device,
-                                 dtype=torch.int32)
+                                 dtype=torch.int64)

    problem_sizes = torch.zeros((num_experts, 3),
                                device=device,

--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the CUTLASS W4A8 kernel.
+
+Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_rows, quantize_weights)
+from vllm.platforms import current_platform
+from vllm.scalar_type import ScalarType, scalar_types
+
+# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
+#  unit tests to a common utility function. Currently the use of
+#  `is_quant_method_supported` conflates kernels with quantization methods
+#  an assumption which is breaking down as quantizations methods can have
+#  have kernels and some kernels support multiple quantization methods.
+IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
+
+MNK_SHAPES = [(1, 128, 128), (1, 512, 1024), (1, 4096, 4096), (1, 8192, 28672),
+              (13, 8192, 4096), (26, 4096, 8192), (64, 4096, 4096),
+              (64, 8192, 28672), (257, 128, 4096), (257, 4096, 4096),
+              (1024, 4096, 8192), (1024, 8192, 4096)]
+
+# TODO(czhu): get supported schedules from fn
+SCHEDULES = [
+    '128x16_1x1x1', '256x16_1x1x1', '128x32_1x1x1', '256x32_1x1x1',
+    '128x64_1x1x1', '256x64_1x1x1', '128x128_1x1x1', '256x128_1x1x1',
+    '128x256_1x1x1', '128x256_2x1x1'
+]
+
+
+@dataclass
+class TypeConfig:
+    act_type: torch.dtype
+    weight_type: ScalarType
+    output_type: Optional[torch.dtype]
+    group_scale_type: Optional[torch.dtype]
+    channel_scale_type: Optional[torch.dtype]
+    token_scale_type: Optional[torch.dtype]
+
+
+@dataclass
+class Tensors:
+    w_ref: torch.Tensor
+    a_ref: torch.Tensor
+    a: torch.Tensor
+    w_q: torch.Tensor
+    w_g_s: torch.Tensor
+    w_ch_s: torch.Tensor
+    w_tok_s: torch.Tensor
+
+
+# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
+#  Ch Scales Type, Tok Scales Type)
+TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
+                      Optional[torch.dtype], bool]
+TEST_TYPES = [
+    *(
+        TypeConfig(act_type=torch.float8_e4m3fn,
+                   weight_type=w_type,
+                   output_type=o_type,
+                   group_scale_type=torch.float8_e4m3fn,
+                   channel_scale_type=torch.float32,
+                   token_scale_type=torch.float32)
+        for w_type in [scalar_types.int4]
+        # TODO(czhu): fp16 out type
+        for o_type in [torch.bfloat16]),
+]
+
+# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
+#  unit tests to a common utility function. Currently the use of
+#  `is_quant_method_supported` conflates kernels with quantization methods
+#  an assumption which is breaking down as quantizations methods can have
+#  have kernels and some kernels support multiple quantization methods.
+IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
+
+
+# For testing quantized linear kernels
+def to_fp8(tensor: torch.Tensor):
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return tensor.clamp(min=finfo.min,
+                        max=finfo.max).to(dtype=torch.float8_e4m3fn)
+
+
+def cutlass_quantize_and_pack(atype: torch.dtype,
+                              w: torch.Tensor,
+                              wtype: ScalarType,
+                              stype: Optional[torch.dtype],
+                              group_size: Optional[int],
+                              zero_points: bool = False):
+    assert wtype.is_integer(), "TODO: support floating point weights"
+
+    w_ref, w_q, w_s, w_zp = quantize_weights(w,
+                                             wtype,
+                                             group_size=group_size,
+                                             zero_points=zero_points)
+
+    # since scales are cast to fp8, we need to compute w_ref this way
+    w_ref = ((w_q).to(torch.float32) * w_s.to(atype).to(
+        torch.float32).repeat_interleave(group_size, dim=0)).to(atype)
+
+    # bit mask prevents sign extending int4 when packing
+    w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
+    w_q = w_q.t().contiguous().t()  # convert to col major
+
+    w_q_packed = ops.cutlass_encode_and_reorder_int4b(w_q)
+    w_s_packed = ops.cutlass_pack_scale_fp8(w_s.to(atype))
+
+    return w_ref, w_q_packed, w_s_packed, w_zp
+
+
+def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
+                        group_size: Optional[int]) -> Tensors:
+    m, n, k = shape
+
+    print("create_test_tensors, shape:", shape, "types:", types, "group_size:",
+          group_size)
+
+    a = to_fp8(torch.randn((m, k), device="cuda"))
+    w = to_fp8(torch.randn((k, n), device="cuda"))
+
+    if types.group_scale_type is not None:
+        w = w.to(types.group_scale_type)
+    if w.dtype.itemsize == 1:
+        w = w.to(torch.float16)
+
+    w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
+        a.dtype, w, types.weight_type, types.group_scale_type, group_size,
+        False)
+
+    a_ref = a.to(torch.float32)
+    w_ref = w_ref.to(torch.float32)
+
+    # for the practical use case we need per-tok scales for fp8 activations
+    w_tok_s = torch.randn((m, ), device='cuda', dtype=types.token_scale_type)
+    # weights are already per-group quantized, use placeholder here
+    w_ch_s = torch.ones((n, ), device='cuda', dtype=types.channel_scale_type)
+
+    return Tensors(w_ref=w_ref,
+                   a_ref=a_ref,
+                   a=a,
+                   w_q=w_q_packed,
+                   w_g_s=w_s,
+                   w_ch_s=w_ch_s,
+                   w_tok_s=w_tok_s)
+
+
+def mm_test_helper(types: TypeConfig,
+                   tensors: Tensors,
+                   group_size: Optional[int] = None,
+                   schedule: Optional[str] = None):
+    # CUTLASS upstream uses fp8 with fastaccum as reference
+    # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
+    output_ref = torch._scaled_mm(
+        tensors.a_ref.to(types.act_type),
+        tensors.w_ref.to(types.act_type).t().contiguous().t(),  # col major
+        tensors.w_tok_s.unsqueeze(1),
+        tensors.w_ch_s.unsqueeze(0),
+        out_dtype=types.output_type,
+        use_fast_accum=True)
+
+    output = ops.cutlass_w4a8_mm(
+        a=tensors.a,
+        b_q=tensors.w_q,
+        b_group_scales=tensors.w_g_s,
+        b_group_size=group_size,
+        b_channel_scales=tensors.w_ch_s,
+        a_token_scales=tensors.w_tok_s,
+    )
+
+    print(output)
+    print(output_ref)
+
+    torch.testing.assert_close(output,
+                               output_ref.to(output.dtype),
+                               rtol=1e-3,
+                               atol=1e-3)
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="CUTLASS W4A8 is not supported on this GPU type.")
+@pytest.mark.parametrize("shape",
+                         MNK_SHAPES,
+                         ids=lambda x: "x".join(str(v) for v in x))
+@pytest.mark.parametrize("types", TEST_TYPES)
+@pytest.mark.parametrize("schedule", SCHEDULES)
+def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
+    group_sizes = [128]
+    for group_size in group_sizes:
+        tensors = create_test_tensors(shape, types, group_size)
+        mm_test_helper(types, tensors, group_size, schedule)
+
+
+# Test to make sure cuda graphs work
+class W4A8Layer(torch.nn.Module):
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.kwargs = kwargs
+
+    def forward(self, a):
+        return ops.cutlass_w4a8_mm(a=a, **self.kwargs)
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="CUTLASS W4A8 is not supported on this GPU type.")
+def test_w4a8_cuda_graph():
+    m, n, k = 512, 4096, 4096
+
+    a = to_fp8(torch.randn((m, k), device="cuda"))
+    b = to_fp8(torch.randn((k, n), device="cuda"))
+
+    wtype = scalar_types.int4
+    stype = torch.float8_e4m3fn
+    group_size = 128
+    zero_points = False
+
+    w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
+        a.dtype, b.to(torch.float16), wtype, stype, group_size, zero_points)
+
+    w_tok_s = torch.randn((m, ), device='cuda', dtype=torch.float32)
+    w_ch_s = torch.ones((n, ), device='cuda', dtype=torch.float32)
+
+    # Construct a trivial model with a single layer that calls the kernel
+    model = W4A8Layer(
+        b_q=w_q_packed,
+        b_group_scales=w_s,
+        b_group_size=group_size,
+        b_channel_scales=w_ch_s,
+        a_token_scales=w_tok_s,
+    )
+
+    output_ref = torch._scaled_mm(
+        a,
+        w_ref.to(a.dtype).t().contiguous().t(),  # col major
+        w_tok_s.unsqueeze(1),
+        w_ch_s.unsqueeze(0),
+        out_dtype=torch.bfloat16,
+        use_fast_accum=True)
+
+    # Run the model with a cuda graph
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            output = model(a)
+
+    output.zero_()
+    g.replay()
+
+    torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3)
--- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
+
+if not current_platform.has_device_capability(100):
+    pytest.skip(
+        reason=
+        "Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
+        allow_module_level=True,
+    )
+
+DTYPES = [torch.float16, torch.bfloat16]
+# m, n, k
+SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)]
+PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
+SHAPES.extend(PAD_SHAPES)
+
+SEEDS = [42]
+CUDA_DEVICES = ["cuda:0"]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("autotune", [False, True])
+@torch.inference_mode()
+def test_flashinfer_fp8_gemm(
+    dtype: torch.dtype,
+    shape: tuple[int, int, int],
+    use_bias: bool,
+    seed: int,
+    device: str,
+    autotune: bool,
+) -> None:
+    current_platform.seed_everything(seed)
+    m, n, k = shape
+    a = torch.randn((m, k), dtype=dtype, device=device)
+    b = torch.randn((n, k), dtype=dtype, device=device) / k
+
+    a_fp8, a_scale = ops.scaled_fp8_quant(a)
+    b_fp8, b_scale = ops.scaled_fp8_quant(b)
+
+    expected_out = torch.mm(
+        a_scale * a_fp8.to(dtype=torch.float32),
+        b_scale * b_fp8.to(dtype=torch.float32).t(),
+    ).to(dtype=dtype)
+
+    if use_bias:
+        bias = torch.randn((n, ), dtype=dtype, device=device)
+        expected_out = expected_out + bias
+    else:
+        bias = None
+
+    import flashinfer
+
+    with flashinfer.autotune(autotune):
+        out = flashinfer_scaled_fp8_mm(
+            a_fp8,
+            b_fp8.t(),
+            a_scale,
+            b_scale,
+            dtype,
+            bias=bias,
+        )
+
+    torch.testing.assert_close(out, expected_out, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the machete kernel.

-Run `pytest tests/kernels/test_machete_mm.py`.
+Run `pytest tests/kernels/quantization/test_machete_mm.py`.
 """

 import math
@@ -95,23 +95,23 @@ TEST_TYPES = [
                 token_scale_type=None)
      for w_type in [scalar_types.uint4, scalar_types.uint8]
      for a_type in [torch.float16, torch.bfloat16]),
-    # QQQ style
-    *(TypeConfig(act_type=torch.int8,
-                 weight_type=scalar_types.uint4b8,
-                 output_type=torch.float16,
-                 group_scale_type=group_scale_type,
-                 group_zero_type=None,
-                 channel_scale_type=torch.float,
-                 token_scale_type=torch.float)
-      for group_scale_type in [None, torch.float16]),
-    *(TypeConfig(act_type=torch.float8_e4m3fn,
-                 weight_type=scalar_types.uint4b8,
-                 output_type=torch.float16,
-                 group_scale_type=group_scale_type,
-                 group_zero_type=None,
-                 channel_scale_type=torch.float,
-                 token_scale_type=torch.float)
-      for group_scale_type in [None, torch.float16]),
+    # # QQQ style
+    # *(TypeConfig(act_type=torch.int8,
+    #              weight_type=scalar_types.uint4b8,
+    #              output_type=torch.float16,
+    #              group_scale_type=group_scale_type,
+    #              group_zero_type=None,
+    #              channel_scale_type=torch.float,
+    #              token_scale_type=torch.float)
+    #   for group_scale_type in [None, torch.float16]),
+    # *(TypeConfig(act_type=torch.float8_e4m3fn,
+    #              weight_type=scalar_types.uint4b8,
+    #              output_type=torch.float16,
+    #              group_scale_type=group_scale_type,
+    #              group_zero_type=None,
+    #              channel_scale_type=torch.float,
+    #              token_scale_type=torch.float)
+    #   for group_scale_type in [None, torch.float16]),
 ]

 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel

--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the marlin kernel.

-Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
+Run `pytest tests/kernels/quantization/test_marlin_gemm.py`.
 """
 import pytest
 import torch
@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
-from vllm.model_executor.layers.quantization.qqq import (
-    MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N,
-    MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
    MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
    query_marlin_supported_quant_types)
@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    marlin_weights)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
    marlin_24_quantize)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import (  # noqa: E501
-    marlin_qqq_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
 from vllm.scalar_type import scalar_types
@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
    assert max_diff < 0.04


-@pytest.mark.skipif(not is_quant_method_supported("qqq"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
-@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
-@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS)
-@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES)
-@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_marlin_qqq_gemm(
-    k_chunk,
-    n_chunk,
-    num_bits,
-    group_size,
-    mnk_factors,
-):
-    int8_traits = torch.iinfo(torch.int8)
-    m_factor, n_factor, k_factor = mnk_factors
-
-    size_m = m_factor
-    size_k = k_chunk * k_factor
-    size_n = n_chunk * n_factor
-
-    a_input = rand_data((size_m, size_k))
-    b_weight = rand_data((size_k, size_n))
-
-    # Quantize activations
-    s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to(
-        torch.float)
-    q_a = (a_input / s_a).round().clamp(int8_traits.min,
-                                        int8_traits.max).to(torch.int8)
-
-    # Quantize weights
-    w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \
-    marlin_qqq_quantize(b_weight, num_bits, group_size)
-
-    workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
-                                MARLIN_QQQ_MAX_PARALLEL)
-
-    opcheck(torch.ops._C.marlin_qqq_gemm,
-            (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
-             marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
-             b_weight.shape[1], a_input.shape[1]))
-
-    output = ops.marlin_qqq_gemm(
-        q_a,
-        marlin_qqq_q_w,
-        s_a,
-        marlin_qqq_s_channel,
-        marlin_qqq_s_group,
-        workspace.scratch,
-        a_input.shape[0],
-        b_weight.shape[1],
-        a_input.shape[1],
-    )
-    output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref)
-
-    torch.cuda.synchronize()
-
-    max_diff = compute_max_diff(output, output_ref)
-
-    assert max_diff < 0.04
-
-
 def test_marlin_gemm_subset_input():
    quant_type = scalar_types.uint4b8
    group_size = 128
@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
    max_diff = compute_max_diff(output, output_ref)

    assert max_diff < 0.04
-
-
-def test_marlin_gemm_opcheck():
-    size_m = 2048
-    size_n = 4096
-    size_k = 4096
-    a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16)
-    w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32)
-    s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16)
-    wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
-                         GPTQ_MARLIN_MAX_PARALLEL).scratch
-    x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
-    y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
-    torch.testing.assert_close(x, y)
-    opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k))
--- a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from tests.kernels.utils import opcheck
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+
+if not current_platform.has_device_capability(100):
+    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
+                allow_module_level=True)
+
+DTYPES = [torch.float16, torch.bfloat16]
+SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
+SEEDS = [42]
+CUDA_DEVICES = ['cuda:0']
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+BLOCK_SIZE = 16
+
+
+def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor,
+             global_scale: torch.Tensor,
+             ref_output_scale: torch.Tensor) -> torch.Tensor:
+    silu_and_mul_out = silu_and_mul.forward_native(x)
+    assert not current_platform.is_rocm()
+    assert silu_and_mul_out.ndim >= 1, (
+        f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.')
+    other_dims = 1 if silu_and_mul_out.ndim == 1 else -1
+    silu_and_mul_out = silu_and_mul_out.reshape(other_dims,
+                                                silu_and_mul_out.shape[-1])
+    m, n = silu_and_mul_out.shape
+    device = silu_and_mul_out.device
+
+    # Two fp4 values will be packed into an uint8.
+    out = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+    output_scale = ref_output_scale
+
+    torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale,
+                                  global_scale)
+
+    return out, output_scale
+
+
+def ops_impl(x: torch.Tensor, global_scale: torch.Tensor,
+             ref_output_scale: torch.Tensor) -> torch.Tensor:
+    out_shape = (x.shape[0], x.shape[1] // 4)
+    output_scale = ref_output_scale
+    out = torch.empty(out_shape, dtype=torch.uint8, device=x.device)
+    torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale)
+    return out, output_scale
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_quantize_to_fp4(
+    dtype: torch.dtype,
+    shape: tuple[int, int],
+    seed: int,
+    device: str,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+
+    m, n = shape
+
+    x = torch.randn((m, n), dtype=dtype)
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+
+    block_size = 16
+
+    assert n % block_size == 0, (
+        f'last dim has to be multiple of 16, but got {n}.')
+    assert x.dtype in (torch.float16, torch.bfloat16), (
+        f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.')
+
+    round_up = lambda x, y: (x + y - 1) // y * y
+    rounded_m = round_up(x.shape[0], 128)
+    scale_n = x.shape[1] // (2 * block_size)
+    rounded_n = round_up(scale_n, 4)
+    output_scale = torch.empty((rounded_m, rounded_n // 4),
+                               device=x.device,
+                               dtype=torch.int32)
+
+    layer = SiluAndMul()
+
+    ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale)
+
+    fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale)
+
+    assert ref_out.dtype == torch.uint8
+    assert fusion_out.dtype == torch.uint8
+    assert ref_out.shape == fusion_out.shape
+
+    assert ref_out_scale.dtype == torch.int32
+    assert fusion_out_scale.dtype == torch.int32
+    assert ref_out_scale.shape == fusion_out_scale.shape
+
+    # Allow up to 2% of mismatched values since BF16 has accuracy issues.
+    mis_threshold = 0.02
+    atol = 0.4
+    rtol = 0.4
+    ref_logits = ref_out[-1]
+    fusion_logits = fusion_out[-1]
+
+    mis_count = torch.sum(
+        torch.abs(fusion_logits - ref_logits) > (atol +
+                                                 rtol * torch.abs(ref_logits)))
+    mis_ratio = mis_count / fusion_logits.numel()
+
+    assert mis_ratio < mis_threshold, \
+        f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}"
+
+    torch.testing.assert_close(ref_out_scale, fusion_out_scale)
+
+    opcheck(torch.ops._C.silu_and_mul_nvfp4_quant,
+            (fusion_out, fusion_out_scale, x, global_scale))
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_scaled_mm kernel

-Run `pytest tests/kernels/test_triton_scaled_mm.py`.
+Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
 """
 import importlib
 from typing import Optional

--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -9,12 +9,17 @@ import pytest
 import torch
 from packaging import version

-from vllm import SamplingParams
+from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
+                                      create_standard_kv_cache_spec,
+                                      create_vllm_config)
+from vllm.v1.attention.backends.flex_attention import (
+    FlexAttentionMetadataBuilder)

-from ..models.utils import check_embeddings_close
+from ..models.utils import check_embeddings_close, check_logprobs_close

 TORCH_VERSION = version.parse(torch.__version__)
 MINIMUM_TORCH_VERSION = version.parse("2.7.0")
+DIRECT_BUILD_VERSION = version.parse("2.9.dev0")


 def set_seed(seed):
@@ -34,22 +39,18 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    """Test that FlexAttention produces the same outputs as the default backend.

    This test compares the outputs from the FlexAttention backend with
-    the default backend, ensuring they are identical when using the same seed.
+    the default backend, ensuring they are similar when using the same seed.
    """
    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
    seed = 42
    max_tokens = 24
+    num_logprobs = 5
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
    ]

-    sampling_params = SamplingParams(temperature=0.0,
-                                     top_p=1.0,
-                                     seed=seed,
-                                     max_tokens=max_tokens)
-
    # Run with flex attention
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
@@ -61,7 +62,8 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
                         tensor_parallel_size=1,
                         num_gpu_blocks_override=128,
                         enforce_eager=True) as llm_flex:
-            output_flex = llm_flex.generate(prompts, sampling_params)
+            output_flex = llm_flex.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)

    # Run with default backend
    with monkeypatch.context() as m:
@@ -71,20 +73,17 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
                         runner="generate",
                         tensor_parallel_size=1,
                         num_gpu_blocks_override=128,
-                         enforce_eager=True) as llm_default:
-            output_default = llm_default.generate(prompts, sampling_params)
-
-    # Compare outputs from both backends
-    for i, (flex_result,
-            default_result) in enumerate(zip(output_flex, output_default)):
-        prompt = prompts[i]
-        flex_text = flex_result[1][0]
-        default_text = default_result[1][0]
-
-        assert flex_text == default_text, (
-            f"FlexAttention output doesn't match default for: {prompt!r}\n"
-            f"FlexAttention: {flex_text!r}\n"
-            f"Default: {default_text!r}")
+                         enforce_eager=True,
+                         gpu_memory_utilization=0.85) as llm_default:
+            output_default = llm_default.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=output_flex,
+        outputs_1_lst=output_default,
+        name_0="flex",
+        name_1="default",
+    )


 @pytest.mark.skipif(
@@ -136,5 +135,70 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    )


+@pytest.mark.skipif(
+    not torch.cuda.is_available() or TORCH_VERSION < DIRECT_BUILD_VERSION,
+    reason="CUDA not available or PyTorch version < 2.7",
+)
+def test_block_mask_direct_vs_slow_path():
+    """Test that direct path block mask is a superset of slow path.
+
+    The direct path may include extra blocks for performance (over-estimation),
+    but must include all blocks that the slow path determines are necessary.
+    """
+    device = torch.device("cuda")
+
+    vllm_config = create_vllm_config(model_name="meta-llama/Meta-Llama-3-8B",
+                                     block_size=16,
+                                     max_model_len=1024)
+    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
+
+    # Use a mixed batch that will create groups spanning multiple sequences
+    batch_spec = BatchSpec(seq_lens=[35, 64, 128, 256],
+                           query_lens=[33, 5, 32, 64],
+                           name="test_mixed_batch")
+
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec, vllm_config.cache_config.block_size, device)
+
+    builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config,
+                                           device)
+
+    metadata_direct = builder.build(common_prefix_len=0,
+                                    common_attn_metadata=common_attn_metadata)
+    builder.direct_build = False
+    metadata_slow = builder.build(common_prefix_len=0,
+                                  common_attn_metadata=common_attn_metadata)
+
+    assert metadata_direct.block_mask is not None
+    assert metadata_slow.block_mask is not None
+
+    # Extract block indices for comparison, B, H are the same
+    direct_indices = metadata_direct.block_mask.kv_indices[0, 0]
+    slow_indices = metadata_slow.block_mask.kv_indices[0, 0]
+    direct_num = metadata_direct.block_mask.kv_num_blocks[0, 0]
+    slow_num = metadata_slow.block_mask.kv_num_blocks[0, 0]
+
+    # main test: every block needed by slow path must be in direct path
+    num_groups = direct_num.shape[0]
+    all_contained = True
+    missing_details = []
+
+    for group_idx in range(num_groups):
+        direct_blocks = set(
+            direct_indices[group_idx, :direct_num[group_idx]].tolist())
+        slow_blocks = set(
+            slow_indices[group_idx, :slow_num[group_idx]].tolist())
+
+        missing_blocks = slow_blocks - direct_blocks
+        if missing_blocks:
+            all_contained = False
+            missing_details.append(
+                f"Group {group_idx}: missing {sorted(missing_blocks)}")
+
+    assert all_contained, (
+        "Direct path is missing blocks required by slow path:\n" +
+        "\n".join(missing_details))
+
+
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Integration tests for FlexAttention backend vs default backend"""
+
+from typing import Optional
+
+import pytest
+import torch
+
+from tests.kernels.utils import to_int8
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+
+if not current_platform.is_cpu():
+    pytest.skip("skipping CPU-only tests", allow_module_level=True)
+
+NK_FACTORS = [
+    (256, 128),
+    (4096, 4096),
+    (16384, 4096),
+    (1023, 491),
+    (1001, 15),
+]
+M_FACTORS = [
+    (16, 1, 32, 128, 64),
+    (1, 17, 1, 31, 17),
+]
+CACHE_SIZES = [2]
+DTYPE = [torch.bfloat16]
+
+
+def rand_int8(shape: tuple, device: str = "cpu"):
+    return to_int8(torch.rand(shape, device=device) * 255 - 128)
+
+
+def ref_int8_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    azp: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    output_type: torch.dtype,
+):
+    if azp is not None:
+        a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32)
+    output = torch.mm((scale_a * a.to(dtype=torch.float32)),
+                      (scale_b * b.to(dtype=torch.float32)))
+    if bias is not None:
+        output += bias.float()
+
+    return output.to(dtype=output_type)
+
+
+def onednn_int8_gemm_test_helper(primitive_cache_size: int,
+                                 m: int,
+                                 n: int,
+                                 k: int,
+                                 per_tensor_a_quant: bool,
+                                 per_tensor_b_quant: bool,
+                                 use_azp: bool,
+                                 use_bias: bool,
+                                 out_dtype: torch.dtype = torch.bfloat16,
+                                 device: str = "cpu"):
+    # Test for a oneDNN kernel with per-tensor / per-token activation
+    # quantization and per-tensor / per-output channel weight quantization.
+    a = to_int8(torch.randn((m, k), device=device) * 5)
+    b = to_int8(torch.randn((n, k), device=device).t() * 5)
+
+    a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1)
+    b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n)
+
+    scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
+    scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
+
+    if use_azp:
+        azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5
+        azp = (azp / scale_a).round().to(dtype=torch.int32)
+        azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32)
+    else:
+        azp = None
+        azp_adj = None
+
+    if use_bias:
+        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
+    else:
+        bias = None
+
+    handler = ops.create_onednn_scaled_mm(
+        b,
+        scale_b,
+        out_dtype,
+        not per_tensor_a_quant,
+        use_azp,
+        primitive_cache_size,
+    )
+
+    out = torch.zeros((m, n), dtype=out_dtype)
+    ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias)
+    baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype)
+
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
+
+    if use_bias:
+        # To test runtime bias setting
+        out = torch.zeros((m, n), dtype=out_dtype)
+        ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None)
+        baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None,
+                                      out_dtype)
+
+        torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
+
+
+@pytest.mark.parametrize("n,k", NK_FACTORS)
+@pytest.mark.parametrize("m_list", M_FACTORS)
+@pytest.mark.parametrize("per_tensor_a_scale", [True, False])
+@pytest.mark.parametrize("per_tensor_b_scale", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("use_azp", [True, False])
+@pytest.mark.parametrize("output_type", DTYPE)
+@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
+def test_onednn_int8_scaled_gemm(
+    n: int,
+    k: int,
+    m_list: tuple[int],
+    per_tensor_a_scale: bool,
+    per_tensor_b_scale: bool,
+    use_bias: bool,
+    use_azp: bool,
+    output_type: torch.dtype,
+    primitive_cache_size: int,
+):
+    for m in m_list:
+        onednn_int8_gemm_test_helper(
+            primitive_cache_size=primitive_cache_size,
+            m=m,
+            n=n,
+            k=k,
+            per_tensor_a_quant=per_tensor_a_scale,
+            per_tensor_b_quant=per_tensor_b_scale,
+            use_bias=use_bias,
+            use_azp=use_azp,
+            out_dtype=output_type,
+        )
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -3,15 +3,13 @@

 import tempfile
 from collections import OrderedDict
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock

 import pytest
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download

-import vllm
-from vllm.config import LoRAConfig
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
@@ -21,7 +19,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models.interfaces import SupportsLoRA
 from vllm.platforms import current_platform

@@ -104,6 +101,7 @@ def dummy_model() -> nn.Module:
        ]))
    model.config = MagicMock()
    model.embedding_modules = {"lm_head": "lm_head"}
+    model.unpadded_vocab_size = 32000
    return model


@@ -137,6 +135,8 @@ def dummy_model_gate_up() -> nn.Module:
        ],
    }
    model.embedding_modules = {"lm_head": "lm_head"}
+    model.unpadded_vocab_size = 32000
+
    return model


@@ -216,34 +216,6 @@ def tinyllama_lora_files():
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")


-@pytest.fixture(scope="session")
-def phi2_lora_files():
-    return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
-
-
-@pytest.fixture
-def llama_2_7b_engine_extra_embeddings():
-    cleanup_dist_env_and_memory(shutdown_ray=True)
-    get_model_old = get_model
-
-    def get_model_patched(**kwargs):
-        kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
-                                                       max_lora_rank=8)
-        return get_model_old(**kwargs)
-
-    with patch("vllm.worker.model_runner.get_model", get_model_patched):
-        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
-    yield engine.llm_engine
-    del engine
-    cleanup_dist_env_and_memory(shutdown_ray=True)
-
-
-@pytest.fixture
-def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
-    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
-           model_runner.model)
-
-
 @pytest.fixture
 def reset_default_device():
    """

--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -5,7 +5,6 @@ import time

 import pytest

-import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
@@ -98,12 +97,10 @@ async def test_add_lora(chatglm3_lora_files):
        # Run with warmup
        add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
        add_lora_results = await asyncio.gather(*add_lora_tasks)
-        if env.VLLM_USE_V1:
-            # Test that all all_lora calls are successful.
-            assert all(add_lora_results)
-        else:
-            # No way to check V0 engine results as the calls just return None.
-            pass
+
+        # Test that all all_lora calls are successful.
+        assert all(add_lora_results)
+
        time_with_add_lora = await requests_processing_time(
            llm, warmup_run_requests)


--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-import vllm
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.lora.request import LoRARequest
-
-MODEL_PATH = "baichuan-inc/Baichuan-7B"
-
-PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
-    prompts = [
-        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
-        PROMPT_TEMPLATE.format(
-            query=
-            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
-        ),
-        PROMPT_TEMPLATE.format(
-            query=
-            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
-        ),
-    ]
-    print(prompts)
-    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
-    # Print the outputs.
-    generated_texts: list[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-def test_baichuan_lora(baichuan_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   trust_remote_code=True)
-
-    expected_lora_output = [
-        "SELECT count(*) FROM singer",
-        "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE Country  =  'France'",  # noqa: E501
-        "SELECT name ,  country ,  age FROM singer ORDER BY age ASC",
-    ]
-
-    output1 = do_sample(llm, baichuan_lora_files, lora_id=1)
-    for i in range(len(expected_lora_output)):
-        assert output1[i] == expected_lora_output[i]
-    output2 = do_sample(llm, baichuan_lora_files, lora_id=2)
-    for i in range(len(expected_lora_output)):
-        assert output2[i] == expected_lora_output[i]
-
-
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
-                                           num_gpus_available, fully_sharded):
-    if num_gpus_available < 4:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
-
-    llm_tp1 = vllm.LLM(MODEL_PATH,
-                       enable_lora=True,
-                       max_num_seqs=16,
-                       max_loras=4,
-                       max_lora_rank=64,
-                       trust_remote_code=True,
-                       fully_sharded_loras=fully_sharded)
-    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
-
-    del llm_tp1
-    cleanup_dist_env_and_memory()
-
-    llm_tp2 = vllm.LLM(MODEL_PATH,
-                       enable_lora=True,
-                       max_num_seqs=16,
-                       max_loras=4,
-                       max_lora_rank=64,
-                       tensor_parallel_size=2,
-                       trust_remote_code=True,
-                       fully_sharded_loras=fully_sharded)
-    output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
-
-    del llm_tp2
-    cleanup_dist_env_and_memory()
-
-    assert output_tp1 == output_tp2
-
-    llm_tp4 = vllm.LLM(MODEL_PATH,
-                       enable_lora=True,
-                       max_num_seqs=16,
-                       max_loras=4,
-                       max_lora_rank=64,
-                       tensor_parallel_size=4,
-                       trust_remote_code=True,
-                       fully_sharded_loras=fully_sharded)
-    output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
-
-    del llm_tp4
-    cleanup_dist_env_and_memory()
-
-    assert output_tp1 == output_tp4
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
+    # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
+    # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
+    # more GPU memory causing vLLM to OOM
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
@@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
                   tensor_parallel_size=4,
                   trust_remote_code=True,
                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True)
+                   enable_chunked_prefill=True,
+                   gpu_memory_utilization=0.85)
    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
        assert output1[i] == EXPECTED_LORA_OUTPUT[i]

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 @torch.inference_mode()
 # @pytest.mark.skip(
 #     reason="Fails when loras are in any slot other than the first.")
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
-def test_linear_replicated(dist_init, num_loras, device, stage,
-                           bias_enabled) -> None:
+def test_linear_replicated(
+    dist_init,
+    num_loras,
+    device,
+    stage,
+) -> None:

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             lora_dtype=torch.float16,
-                             bias_enabled=bias_enabled)
+    lora_config = LoRAConfig(
+        max_loras=max_loras,
+        max_lora_rank=8,
+        lora_dtype=torch.float16,
+    )

    def create_random_linear_replicated_layer():

@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
        lora_linear.create_lora_weights(max_loras, lora_config)
        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
            lora_linear.lora_b_stacked) == 1)
-        if bias_enabled:
-            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
-        else:
-            assert lora_linear.lora_bias_stacked is None
        return linear, lora_linear

    for i in range(NUM_RANDOM_SEEDS):
@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("orientation", ["row", "column"])
 @pytest.mark.parametrize("fully_shard", [True, False])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
 def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
-                         device, stage, bias_enabled) -> None:
+                         device, stage) -> None:

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             fully_sharded_loras=fully_shard,
-                             lora_dtype=torch.float16,
-                             bias_enabled=bias_enabled)
+    lora_config = LoRAConfig(
+        max_loras=max_loras,
+        max_lora_rank=8,
+        fully_sharded_loras=fully_shard,
+        lora_dtype=torch.float16,
+    )

    def create_random_linear_parallel_layer():
        if orientation == "row":
@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
        lora_linear.create_lora_weights(max_loras, lora_config)
        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
            lora_linear.lora_b_stacked) == 1)
-        if bias_enabled:
-            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
-        else:
-            assert lora_linear.lora_bias_stacked is None
+
        return linear, lora_linear

    for i in range(NUM_RANDOM_SEEDS):
@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("repeats", [1, 2, 3])
 @pytest.mark.parametrize("fully_shard", [True, False])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
 def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
-                                device, stage, bias_enabled) -> None:
+                                device, stage) -> None:

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             fully_sharded_loras=fully_shard,
-                             lora_dtype=torch.float16,
-                             bias_enabled=bias_enabled)
+    lora_config = LoRAConfig(
+        max_loras=max_loras,
+        max_lora_rank=8,
+        fully_sharded_loras=fully_shard,
+        lora_dtype=torch.float16,
+    )

    def create_column_parallel_packed_layer():
        if repeats == 2:
@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                        model_config=FakeConfig())
        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
            lora_linear.lora_b_stacked) == n_slices)
-        if bias_enabled:
-            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
-        else:
-            assert lora_linear.lora_bias_stacked is None
+
        return linear, lora_linear

    for i in range(NUM_RANDOM_SEEDS):

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -113,8 +113,7 @@ def test_llama_lora(sql_lora_files):
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=13,
-        max_loras=4,
-        enable_chunked_prefill=True)
+        max_loras=4)
    generate_and_test(llm, sql_lora_files)


@@ -128,7 +127,6 @@ def test_llama_lora_tp4(sql_lora_files):
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=4,
-        enable_chunked_prefill=True,
    )
    generate_and_test(llm, sql_lora_files)

@@ -144,7 +142,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
        max_loras=4,
        tensor_parallel_size=4,
        fully_sharded_loras=True,
-        enable_chunked_prefill=True,
    )
    generate_and_test(llm, sql_lora_files)


--- a/tests/lora/test_multi_loras_with_tp.py
+++ b/tests/lora/test_multi_loras_with_tp.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Script to test multi loras service with tp >= 2
+This script contains:
+1. test multi loras service with tp >= 2
+2. test multi loras request
 """
+import pytest
+
 from tests.utils import multi_gpu_test
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():

        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(output_text, expected_output)
+
+
+def test_multiple_lora_requests():
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    PROMPTS = ["Hello, my name is"] * 2
+    LORA_NAME = "Alice"
+    lora_request = [
+        LoRARequest(LORA_NAME + str(idx), idx + 1,
+                    LORA_NAME_PATH_MAP[LORA_NAME])
+        for idx in range(len(PROMPTS))
+    ]
+    # Multiple SamplingParams should be matched with each prompt
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+
+    # Exception raised, if the size of params does not match the size of prompts
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
+
+    # Single LoRARequest should be applied to every prompt
+    single_lora_request = lora_request[0]
+    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
+    assert len(PROMPTS) == len(outputs)
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -21,6 +21,8 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
                                      WorkerLoRAManager)
 from vllm.platforms import current_platform

+from .utils import create_peft_lora
+
 EMBEDDING_MODULES = {
    "embed_tokens": "input_embeddings",
    "lm_head": "output_embeddings",
@@ -35,17 +37,6 @@ DEVICES = ([
 DEFAULT_DTYPE = torch.get_default_dtype()


-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Some tests depend on V0 internals. Since both V0 and V1 use the same
-    LoRAModelManager it is okay to just test V0.
-    """
-    with monkeypatch.context() as m:
-        m.setenv('VLLM_USE_V1', '0')
-        yield
-
-
 @pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
    tensors = load_file(
@@ -326,7 +317,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
                                                  max_loras=2,
                                                  lora_dtype=DEFAULT_DTYPE),
                                       device=device)
-
    assert all(x is None for x in manager.lora_index_to_id)

    # Add up to capacity
@@ -430,32 +420,40 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):


 @pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
-                                          sql_lora_files, device):
+def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
+                                          tmp_path):
    lora_config = LoRAConfig(max_lora_rank=8,
                             max_cpu_loras=4,
                             max_loras=4,
                             lora_dtype=DEFAULT_DTYPE)
+
+    dummy_lora_files = f"{tmp_path}/lora_adapter"
+    os.makedirs(dummy_lora_files, exist_ok=True)
+    create_peft_lora(
+        dummy_model,
+        save_dir=dummy_lora_files,
+        target_modules=["layer1.dense1", "dense2"],
+        lora_dtype=DEFAULT_DTYPE,
+    )
    worker_adapter_manager = LRUCacheWorkerLoRAManager(
-        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
-        lora_config.lora_extra_vocab_size, lora_config, device,
-        EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
-    worker_adapter_manager.create_lora_manager(
-        llama_2_7b_model_extra_embeddings)
+        4, 2,
+        dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size,
+        lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
+    worker_adapter_manager.create_lora_manager(dummy_model)

    mapping = LoRAMapping([], [])
    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("2", 2, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("2", 2, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 2}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("3", 3, sql_lora_files),
-        LoRARequest("4", 4, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("3", 3, dummy_lora_files),
+        LoRARequest("4", 4, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -464,9 +462,9 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("2", 2, sql_lora_files),
-        LoRARequest("5", 5, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("2", 2, dummy_lora_files),
+        LoRARequest("5", 5, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -475,9 +473,9 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("1", 1, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("1", 1, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -486,9 +484,9 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("6", 6, sql_lora_files),
-        LoRARequest("7", 7, sql_lora_files),
-        LoRARequest("8", 8, sql_lora_files)
+        LoRARequest("6", 6, dummy_lora_files),
+        LoRARequest("7", 7, dummy_lora_files),
+        LoRARequest("8", 8, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -499,11 +497,11 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    # Over capacity
    with pytest.raises(RuntimeError):
        worker_adapter_manager.set_active_adapters([
-            LoRARequest("10", 10, sql_lora_files),
-            LoRARequest("11", 11, sql_lora_files),
-            LoRARequest("12", 12, sql_lora_files),
-            LoRARequest("13", 13, sql_lora_files),
-            LoRARequest("14", 14, sql_lora_files)
+            LoRARequest("10", 10, dummy_lora_files),
+            LoRARequest("11", 11, dummy_lora_files),
+            LoRARequest("12", 12, dummy_lora_files),
+            LoRARequest("13", 13, dummy_lora_files),
+            LoRARequest("14", 14, dummy_lora_files)
        ], mapping)

    assert worker_adapter_manager.device == device
@@ -512,33 +510,41 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,


 @pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
-                                sql_lora_files, device):
+def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
+                                tmp_path):
    # Should remove every LoRA not specified in the request.
    lora_config = LoRAConfig(max_lora_rank=8,
                             max_cpu_loras=4,
                             max_loras=4,
                             lora_dtype=DEFAULT_DTYPE)
    worker_adapter_manager = WorkerLoRAManager(
-        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+        4, 2, dummy_model_gate_up.unpadded_vocab_size -
        lora_config.lora_extra_vocab_size, lora_config, device,
        EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
-    worker_adapter_manager.create_lora_manager(
-        llama_2_7b_model_extra_embeddings)
+    worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
+
+    dummy_lora_files = f"{tmp_path}/lora_adapter"
+    os.makedirs(dummy_lora_files, exist_ok=True)
+    create_peft_lora(
+        dummy_model_gate_up,
+        save_dir=dummy_lora_files,
+        target_modules=["layer1.dense1", "dense2"],
+        lora_dtype=DEFAULT_DTYPE,
+    )

    mapping = LoRAMapping([], [])
    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("2", 2, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("2", 2, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 2}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("3", 3, sql_lora_files),
-        LoRARequest("4", 4, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("3", 3, dummy_lora_files),
+        LoRARequest("4", 4, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 3, 4}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -546,9 +552,9 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("2", 2, sql_lora_files),
-        LoRARequest("5", 5, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("2", 2, dummy_lora_files),
+        LoRARequest("5", 5, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1, 2, 5}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -556,9 +562,9 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("1", 1, sql_lora_files),
-        LoRARequest("1", 1, sql_lora_files)
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("1", 1, dummy_lora_files),
+        LoRARequest("1", 1, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {1}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
@@ -566,9 +572,9 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None

    worker_adapter_manager.set_active_adapters([
-        LoRARequest("6", 6, sql_lora_files),
-        LoRARequest("7", 7, sql_lora_files),
-        LoRARequest("8", 8, sql_lora_files)
+        LoRARequest("6", 6, dummy_lora_files),
+        LoRARequest("7", 7, dummy_lora_files),
+        LoRARequest("8", 8, dummy_lora_files)
    ], mapping)
    assert worker_adapter_manager.list_adapters() == {6, 7, 8}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
@@ -578,11 +584,11 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
    # Over capacity
    with pytest.raises(RuntimeError):
        worker_adapter_manager.set_active_adapters([
-            LoRARequest("10", 10, sql_lora_files),
-            LoRARequest("11", 11, sql_lora_files),
-            LoRARequest("12", 12, sql_lora_files),
-            LoRARequest("13", 13, sql_lora_files),
-            LoRARequest("14", 14, sql_lora_files)
+            LoRARequest("10", 10, dummy_lora_files),
+            LoRARequest("11", 11, dummy_lora_files),
+            LoRARequest("12", 12, dummy_lora_files),
+            LoRARequest("13", 13, dummy_lora_files),
+            LoRARequest("14", 14, dummy_lora_files)
        ], mapping)

    assert worker_adapter_manager.device == device