Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

a99300bd · zhuwenwen · cc3e01c7 · 5438967f · a99300bd · a99300bd
Commit a99300bd authored Sep 09, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/moe/untest_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/untest_pplx_cutlass_moe.py
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
 from vllm.platforms import current_platform
 from vllm.utils import cdiv

+from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch

 try:
@@ -76,6 +77,7 @@ def pplx_cutlass_moe(
    assert torch.cuda.current_device() == pgi.local_rank

    num_tokens, hidden_dim = a.shape
+    intermediate_dim = w2.shape[2]
    num_experts = w1.shape[0]
    block_size = hidden_dim  # TODO support more cases
    device = pgi.device
@@ -124,8 +126,27 @@ def pplx_cutlass_moe(
        num_local_experts=num_local_experts,
        num_dispatchers=num_dispatchers)

+    ab_strides1 = torch.full((num_local_experts, ),
+                             hidden_dim,
+                             device="cuda",
+                             dtype=torch.int64)
+    ab_strides2 = torch.full((num_local_experts, ),
+                             intermediate_dim,
+                             device="cuda",
+                             dtype=torch.int64)
+    c_strides1 = torch.full((num_local_experts, ),
+                            2 * intermediate_dim,
+                            device="cuda",
+                            dtype=torch.int64)
+    c_strides2 = torch.full((num_local_experts, ),
+                            hidden_dim,
+                            device="cuda",
+                            dtype=torch.int64)
+
    experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers,
-                                       out_dtype, per_act_token, per_out_ch)
+                                       out_dtype, per_act_token, per_out_ch,
+                                       ab_strides1, ab_strides2, c_strides1,
+                                       c_strides2)

    fused_cutlass_experts = FusedMoEModularKernel(
        prepare_finalize,
@@ -227,6 +248,7 @@ def _pplx_moe(
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [4, 2]])
 @pytest.mark.parametrize("use_internode", [False])
+@multi_gpu_test(num_gpus=2)
 @pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),

--- a/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
@@ -24,7 +24,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
    current_platform.seed_everything(seed)

    # Input tensor of shape (E, T, 2*H)
-    y = torch.randn((E, T, 2 * H), dtype=torch.float32, device="cuda")
+    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
    tokens_per_expert = torch.randint(
        low=0,
        high=T,
@@ -74,7 +74,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
        y_se = y_s[e]
        y_qe = y_q[e]

-        torch.testing.assert_close(y_se[:nt], ref_s[:nt])
+        torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
        torch.testing.assert_close(
            y_qe[:nt].to(torch.float32),
            ref_q[:nt].to(torch.float32),

--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the AWQ Triton kernel.

-Run `pytest tests/kernels/test_awq_triton.py`.
+Run `pytest tests/kernels/quantization/test_awq_triton.py`.
 """
 import pytest
 import torch

--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the CUTLASS W4A8 kernel.
+
+Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_rows, quantize_weights)
+from vllm.platforms import current_platform
+from vllm.scalar_type import ScalarType, scalar_types
+
+# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
+#  unit tests to a common utility function. Currently the use of
+#  `is_quant_method_supported` conflates kernels with quantization methods,
+#  an assumption which is breaking down as quantization methods can have
+#  multiple kernels and some kernels support multiple quantization methods.
+# Use `has_device_capability` instead of indexing the result of
+# `get_device_capability()`: that call can return None, and indexing None
+# would raise at import time instead of letting the tests be skipped.
+IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
+
+MNK_SHAPES = [(1, 128, 128), (1, 512, 1024), (1, 4096, 4096), (1, 8192, 28672),
+              (13, 8192, 4096), (26, 4096, 8192), (64, 4096, 4096),
+              (64, 8192, 28672), (257, 128, 4096), (257, 4096, 4096),
+              (1024, 4096, 8192), (1024, 8192, 4096)]
+
+# TODO(czhu): get supported schedules from fn
+SCHEDULES = [
+    '128x16_1x1x1', '256x16_1x1x1', '128x32_1x1x1', '256x32_1x1x1',
+    '128x64_1x1x1', '256x64_1x1x1', '128x128_1x1x1', '256x128_1x1x1',
+    '128x256_1x1x1', '128x256_2x1x1'
+]
+
+
+@dataclass
+class TypeConfig:
+    """Dtype combination describing one parametrized W4A8 test case."""
+    # dtype the reference operands are cast to before `torch._scaled_mm`
+    act_type: torch.dtype
+    # quantized weight type handed to the quantize-and-pack helper
+    weight_type: ScalarType
+    # `out_dtype` requested from the reference `torch._scaled_mm`
+    output_type: Optional[torch.dtype]
+    # dtype the random weights are cast to before quantization (None: no cast)
+    group_scale_type: Optional[torch.dtype]
+    # dtype of the per-channel scale tensor
+    channel_scale_type: Optional[torch.dtype]
+    # dtype of the per-token scale tensor
+    token_scale_type: Optional[torch.dtype]
+
+
+@dataclass
+class Tensors:
+    """Bundle of tensors produced by `create_test_tensors` for one test case."""
+    # dequantized reference weights, stored as float32
+    w_ref: torch.Tensor
+    # activations upcast to float32 for the reference path
+    a_ref: torch.Tensor
+    # fp8 activations fed to the kernel under test
+    a: torch.Tensor
+    # quantized weights after packing/reordering for the CUTLASS kernel
+    w_q: torch.Tensor
+    # packed group scales
+    w_g_s: torch.Tensor
+    # per-channel scales (filled with ones by `create_test_tensors`)
+    w_ch_s: torch.Tensor
+    # per-token scales
+    w_tok_s: torch.Tensor
+
+
+# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
+#  Ch Scales Type, Tok Scales Type)
+TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
+                      Optional[torch.dtype], bool]
+TEST_TYPES = [
+    *(
+        TypeConfig(act_type=torch.float8_e4m3fn,
+                   weight_type=w_type,
+                   output_type=o_type,
+                   group_scale_type=torch.float8_e4m3fn,
+                   channel_scale_type=torch.float32,
+                   token_scale_type=torch.float32)
+        for w_type in [scalar_types.int4]
+        # TODO(czhu): fp16 out type
+        for o_type in [torch.bfloat16]),
+]
+
+# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
+#  unit tests to a common utility function. Currently the use of
+#  `is_quant_method_supported` conflates kernels with quantization methods,
+#  an assumption which is breaking down as quantization methods can have
+#  multiple kernels and some kernels support multiple quantization methods.
+IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
+
+
+# For testing quantized linear kernels
+def to_fp8(tensor: torch.Tensor):
+    """Saturate `tensor` to the float8_e4m3fn range and cast it to that dtype."""
+    fp8_dtype = torch.float8_e4m3fn
+    limits = torch.finfo(fp8_dtype)
+    # Clamp first so out-of-range values saturate at the fp8 limits.
+    saturated = torch.clamp(tensor, min=limits.min, max=limits.max)
+    return saturated.to(dtype=fp8_dtype)
+
+
+def cutlass_quantize_and_pack(atype: torch.dtype,
+                              w: torch.Tensor,
+                              wtype: ScalarType,
+                              stype: Optional[torch.dtype],
+                              group_size: Optional[int],
+                              zero_points: bool = False):
+    """Quantize `w` and pack weights/scales for the CUTLASS W4A8 kernel.
+
+    Args:
+        atype: Activation dtype; scales and the reference weights are cast
+            to it.
+        w: Weight matrix to quantize.
+        wtype: Integer scalar type of the quantized weights.
+        stype: Not read by this function.
+            NOTE(review): unused parameter - confirm whether it can go.
+        group_size: Quantization group size. NOTE(review): annotated
+            Optional but passed straight to `repeat_interleave`, so None
+            presumably is not supported here - confirm.
+        zero_points: Forwarded to `quantize_weights`.
+
+    Returns:
+        Tuple `(w_ref, w_q_packed, w_s_packed, w_zp)`: reference weights in
+        `atype`, encoded/reordered quantized weights, packed scales, and the
+        zero points returned by `quantize_weights`.
+    """
+    assert wtype.is_integer(), "TODO: support floating point weights"
+
+    w_ref, w_q, w_s, w_zp = quantize_weights(w,
+                                             wtype,
+                                             group_size=group_size,
+                                             zero_points=zero_points)
+
+    # since scales are cast to fp8, we need to compute w_ref this way
+    # (i.e. rebuild it from w_q and the scales after their round trip
+    # through `atype`, instead of using the w_ref returned above)
+    w_ref = ((w_q).to(torch.float32) * w_s.to(atype).to(
+        torch.float32).repeat_interleave(group_size, dim=0)).to(atype)
+
+    # bit mask prevents sign extending int4 when packing
+    w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
+    w_q = w_q.t().contiguous().t()  # convert to col major
+
+    w_q_packed = ops.cutlass_encode_and_reorder_int4b(w_q)
+    w_s_packed = ops.cutlass_pack_scale_fp8(w_s.to(atype))
+
+    return w_ref, w_q_packed, w_s_packed, w_zp
+
+
+def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
+                        group_size: Optional[int]) -> Tensors:
+    """Build random activations, quantized weights and scales for one case.
+
+    Args:
+        shape: `(m, n, k)` problem size; activations are `(m, k)` and
+            weights are `(k, n)`.
+        types: Dtype configuration for this case.
+        group_size: Weight quantization group size.
+
+    Returns:
+        A `Tensors` bundle holding both the kernel inputs and the float32
+        reference operands.
+    """
+    m, n, k = shape
+
+    print("create_test_tensors, shape:", shape, "types:", types, "group_size:",
+          group_size)
+
+    a = to_fp8(torch.randn((m, k), device="cuda"))
+    w = to_fp8(torch.randn((k, n), device="cuda"))
+
+    if types.group_scale_type is not None:
+        w = w.to(types.group_scale_type)
+    # 1-byte dtypes are widened to fp16 before being quantized
+    if w.dtype.itemsize == 1:
+        w = w.to(torch.float16)
+
+    w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
+        a.dtype, w, types.weight_type, types.group_scale_type, group_size,
+        False)
+
+    a_ref = a.to(torch.float32)
+    w_ref = w_ref.to(torch.float32)
+
+    # for the practical use case we need per-tok scales for fp8 activations
+    w_tok_s = torch.randn((m, ), device='cuda', dtype=types.token_scale_type)
+    # weights are already per-group quantized, use placeholder here
+    w_ch_s = torch.ones((n, ), device='cuda', dtype=types.channel_scale_type)
+
+    return Tensors(w_ref=w_ref,
+                   a_ref=a_ref,
+                   a=a,
+                   w_q=w_q_packed,
+                   w_g_s=w_s,
+                   w_ch_s=w_ch_s,
+                   w_tok_s=w_tok_s)
+
+
+def mm_test_helper(types: TypeConfig,
+                   tensors: Tensors,
+                   group_size: Optional[int] = None,
+                   schedule: Optional[str] = None):
+    """Compare `ops.cutlass_w4a8_mm` against a `torch._scaled_mm` reference.
+
+    Args:
+        types: Dtype configuration (activation and output dtypes are read).
+        tensors: Inputs produced by `create_test_tensors`.
+        group_size: Group size forwarded to the kernel as `b_group_size`.
+        schedule: NOTE(review): accepted but never forwarded to the kernel
+            call below, so parametrizing over schedules currently repeats
+            the same computation - confirm the intended keyword and wire
+            it through.
+
+    Raises:
+        AssertionError: If the outputs differ beyond rtol=1e-3 / atol=1e-3.
+    """
+    # CUTLASS upstream uses fp8 with fastaccum as reference
+    # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
+    output_ref = torch._scaled_mm(
+        tensors.a_ref.to(types.act_type),
+        tensors.w_ref.to(types.act_type).t().contiguous().t(),  # col major
+        tensors.w_tok_s.unsqueeze(1),
+        tensors.w_ch_s.unsqueeze(0),
+        out_dtype=types.output_type,
+        use_fast_accum=True)
+
+    output = ops.cutlass_w4a8_mm(
+        a=tensors.a,
+        b_q=tensors.w_q,
+        b_group_scales=tensors.w_g_s,
+        b_group_size=group_size,
+        b_channel_scales=tensors.w_ch_s,
+        a_token_scales=tensors.w_tok_s,
+    )
+
+    # Debug output; only visible when pytest output capturing is disabled.
+    print(output)
+    print(output_ref)
+
+    torch.testing.assert_close(output,
+                               output_ref.to(output.dtype),
+                               rtol=1e-3,
+                               atol=1e-3)
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="CUTLASS W4A8 is not supported on this GPU type.")
+@pytest.mark.parametrize("shape",
+                         MNK_SHAPES,
+                         ids=lambda x: "x".join(map(str, x)))
+@pytest.mark.parametrize("types", TEST_TYPES)
+@pytest.mark.parametrize("schedule", SCHEDULES)
+def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
+    """Check the W4A8 kernel against the reference for each shape/type/schedule.
+
+    Only a group size of 128 is exercised.
+    """
+    for group_size in (128, ):
+        case_tensors = create_test_tensors(shape, types, group_size)
+        mm_test_helper(types, case_tensors, group_size, schedule)
+
+
+# Test to make sure cuda graphs work
+class W4A8Layer(torch.nn.Module):
+    """Minimal module whose forward pass is a single `cutlass_w4a8_mm` call."""
+
+    def __init__(self, **kwargs):
+        """Store the keyword arguments that accompany `a` in every call."""
+        super().__init__()
+        self.kwargs = kwargs
+
+    def forward(self, a):
+        """Run the kernel on activations `a` with the stored arguments."""
+        result = ops.cutlass_w4a8_mm(a=a, **self.kwargs)
+        return result
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="CUTLASS W4A8 is not supported on this GPU type.")
+def test_w4a8_cuda_graph():
+    """Capture the W4A8 kernel in a CUDA graph and verify the replayed output.
+
+    The kernel is wrapped in `W4A8Layer`, captured on a side stream, and the
+    replayed result is compared against a `torch._scaled_mm` reference.
+    """
+    m, n, k = 512, 4096, 4096
+
+    a = to_fp8(torch.randn((m, k), device="cuda"))
+    b = to_fp8(torch.randn((k, n), device="cuda"))
+
+    wtype = scalar_types.int4
+    stype = torch.float8_e4m3fn
+    group_size = 128
+    zero_points = False
+
+    # Weights are widened to fp16 before quantization.
+    w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
+        a.dtype, b.to(torch.float16), wtype, stype, group_size, zero_points)
+
+    w_tok_s = torch.randn((m, ), device='cuda', dtype=torch.float32)
+    w_ch_s = torch.ones((n, ), device='cuda', dtype=torch.float32)
+
+    # Construct a trivial model with a single layer that calls the kernel
+    model = W4A8Layer(
+        b_q=w_q_packed,
+        b_group_scales=w_s,
+        b_group_size=group_size,
+        b_channel_scales=w_ch_s,
+        a_token_scales=w_tok_s,
+    )
+
+    output_ref = torch._scaled_mm(
+        a,
+        w_ref.to(a.dtype).t().contiguous().t(),  # col major
+        w_tok_s.unsqueeze(1),
+        w_ch_s.unsqueeze(0),
+        out_dtype=torch.bfloat16,
+        use_fast_accum=True)
+
+    # Run the model with a cuda graph
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            output = model(a)
+
+    # Clear the captured output buffer so the comparison below can only pass
+    # if the replay actually rewrites it.
+    output.zero_()
+    g.replay()
+
+    torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3)
--- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
+
+if not current_platform.has_device_capability(100):
+    pytest.skip(
+        reason=
+        "Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
+        allow_module_level=True,
+    )
+
+DTYPES = [torch.float16, torch.bfloat16]
+# m, n, k
+SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)]
+PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
+SHAPES.extend(PAD_SHAPES)
+
+SEEDS = [42]
+CUDA_DEVICES = ["cuda:0"]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("autotune", [False, True])
+@torch.inference_mode()
+def test_flashinfer_fp8_gemm(
+    dtype: torch.dtype,
+    shape: tuple[int, int, int],
+    use_bias: bool,
+    seed: int,
+    device: str,
+    autotune: bool,
+) -> None:
+    """Compare `flashinfer_scaled_fp8_mm` with a dequantized fp32 matmul.
+
+    Inputs are quantized with `ops.scaled_fp8_quant`; the expected result is
+    computed by rescaling the fp8 values in float32, multiplying, casting to
+    `dtype` and optionally adding the bias.
+    """
+    current_platform.seed_everything(seed)
+    m, n, k = shape
+    a = torch.randn((m, k), dtype=dtype, device=device)
+    # b is created as (n, k) and transposed where a (k, n) operand is needed
+    b = torch.randn((n, k), dtype=dtype, device=device) / k
+
+    a_fp8, a_scale = ops.scaled_fp8_quant(a)
+    b_fp8, b_scale = ops.scaled_fp8_quant(b)
+
+    expected_out = torch.mm(
+        a_scale * a_fp8.to(dtype=torch.float32),
+        b_scale * b_fp8.to(dtype=torch.float32).t(),
+    ).to(dtype=dtype)
+
+    if use_bias:
+        bias = torch.randn((n, ), dtype=dtype, device=device)
+        expected_out = expected_out + bias
+    else:
+        bias = None
+
+    # Imported here rather than at module top; the module-level skip above
+    # runs first on unsupported GPUs.
+    import flashinfer
+
+    with flashinfer.autotune(autotune):
+        out = flashinfer_scaled_fp8_mm(
+            a_fp8,
+            b_fp8.t(),
+            a_scale,
+            b_scale,
+            dtype,
+            bias=bias,
+        )
+
+    torch.testing.assert_close(out, expected_out, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from tests.kernels.utils import opcheck
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+
+if not current_platform.has_device_capability(100):
+    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
+                allow_module_level=True)
+
+DTYPES = [torch.float16, torch.bfloat16]
+SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
+SEEDS = [42]
+CUDA_DEVICES = ['cuda:0']
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+BLOCK_SIZE = 16
+
+
+def ref_impl(
+        silu_and_mul: SiluAndMul, x: torch.Tensor, global_scale: torch.Tensor,
+        ref_output_scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Unfused reference: native SiLU-and-mul followed by `scaled_fp4_quant`.
+
+    Args:
+        silu_and_mul: Activation layer; its `forward_native` path is used.
+        x: Input tensor.
+        global_scale: Global scale passed to the quantization op.
+        ref_output_scale: Scale buffer passed to the quantization op
+            (presumably written in place - confirm against the op).
+
+    Returns:
+        Tuple `(out, output_scale)`: the uint8 output buffer (two fp4 values
+        per byte) and the same tensor object that was passed in as
+        `ref_output_scale`.
+    """
+    silu_and_mul_out = silu_and_mul.forward_native(x)
+    assert not current_platform.is_rocm()
+    assert silu_and_mul_out.ndim >= 1, (
+        f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.')
+    # Flatten all leading dimensions so the quantization op sees a 2-D input.
+    other_dims = 1 if silu_and_mul_out.ndim == 1 else -1
+    silu_and_mul_out = silu_and_mul_out.reshape(other_dims,
+                                                silu_and_mul_out.shape[-1])
+    m, n = silu_and_mul_out.shape
+    device = silu_and_mul_out.device
+
+    # Two fp4 values will be packed into an uint8.
+    out = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+    output_scale = ref_output_scale
+
+    torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale,
+                                  global_scale)
+
+    return out, output_scale
+
+
+def ops_impl(
+        x: torch.Tensor, global_scale: torch.Tensor,
+        ref_output_scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Fused path: call `silu_and_mul_nvfp4_quant` directly on `x`.
+
+    Args:
+        x: 2-D input tensor.
+        global_scale: Global scale passed to the fused op.
+        ref_output_scale: Scale buffer passed to the fused op (presumably
+            written in place - confirm against the op).
+
+    Returns:
+        Tuple `(out, output_scale)`: the uint8 output buffer and the same
+        tensor object that was passed in as `ref_output_scale`.
+    """
+    # The output has a quarter of the input's columns. Presumably SiLU-and-mul
+    # halves the last dimension and fp4 packing halves it again - confirm.
+    out_shape = (x.shape[0], x.shape[1] // 4)
+    output_scale = ref_output_scale
+    out = torch.empty(out_shape, dtype=torch.uint8, device=x.device)
+    torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale)
+    return out, output_scale
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_quantize_to_fp4(
+    dtype: torch.dtype,
+    shape: tuple[int, int],
+    seed: int,
+    device: str,
+) -> None:
+    """Compare the fused SiLU-mul + NVFP4 quant op with the unfused reference.
+
+    Checks dtypes and shapes of both results, a mismatch ratio on the last
+    output row, the scale tensors, and finally runs `opcheck` on the fused op.
+    """
+    current_platform.seed_everything(seed)
+    # NOTE(review): the default device is changed here and never restored in
+    # this function - confirm this does not leak into other tests.
+    torch.set_default_device(device)
+
+    m, n = shape
+
+    x = torch.randn((m, n), dtype=dtype)
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+
+    block_size = 16
+
+    assert n % block_size == 0, (
+        f'last dim has to be multiple of 16, but got {n}.')
+    assert x.dtype in (torch.float16, torch.bfloat16), (
+        f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.')
+
+    # Round the scale buffer up to multiples of 128 rows and 4 columns; the
+    # int32 buffer then has a quarter as many columns (presumably four
+    # 1-byte scales per int32 - confirm).
+    round_up = lambda x, y: (x + y - 1) // y * y
+    rounded_m = round_up(x.shape[0], 128)
+    scale_n = x.shape[1] // (2 * block_size)
+    rounded_n = round_up(scale_n, 4)
+    output_scale = torch.empty((rounded_m, rounded_n // 4),
+                               device=x.device,
+                               dtype=torch.int32)
+
+    layer = SiluAndMul()
+
+    # NOTE(review): the same `output_scale` tensor is passed to both
+    # implementations and each returns that very object, so `ref_out_scale`
+    # and `fusion_out_scale` below are the same tensor. The scale comparison
+    # at the end therefore compares a tensor with itself - confirm whether
+    # separate buffers were intended.
+    ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale)
+
+    fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale)
+
+    assert ref_out.dtype == torch.uint8
+    assert fusion_out.dtype == torch.uint8
+    assert ref_out.shape == fusion_out.shape
+
+    assert ref_out_scale.dtype == torch.int32
+    assert fusion_out_scale.dtype == torch.int32
+    assert ref_out_scale.shape == fusion_out_scale.shape
+
+    # Allow up to 2% of mismatched values since BF16 has accuracy issues.
+    mis_threshold = 0.02
+    atol = 0.4
+    rtol = 0.4
+    # Only the last row of each output is compared.
+    ref_logits = ref_out[-1]
+    fusion_logits = fusion_out[-1]
+
+    # NOTE(review): both operands are uint8 (asserted above), so the
+    # subtraction is presumably done in uint8 and may wrap around - confirm
+    # this is the intended comparison for packed fp4 bytes.
+    mis_count = torch.sum(
+        torch.abs(fusion_logits - ref_logits) > (atol +
+                                                 rtol * torch.abs(ref_logits)))
+    mis_ratio = mis_count / fusion_logits.numel()
+
+    assert mis_ratio < mis_threshold, \
+        f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}"
+
+    torch.testing.assert_close(ref_out_scale, fusion_out_scale)
+
+    opcheck(torch.ops._C.silu_and_mul_nvfp4_quant,
+            (fusion_out, fusion_out_scale, x, global_scale))
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_scaled_mm kernel

-Run `pytest tests/kernels/test_triton_scaled_mm.py`.
+Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
 """
 import os
 import importlib

--- a/tests/kernels/quantization/untest_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/untest_cutlass_2of4_sparse.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for sparse cutlass kernels

-Run `pytest tests/kernels/test_semi_structured.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`.
 """

 import pytest

--- a/tests/kernels/quantization/untest_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/untest_cutlass_scaled_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for cutlass kernels

-Run `pytest tests/kernels/test_cutlass.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`.
 """
 import random

@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,

    expert_offsets = torch.zeros((num_experts + 1),
                                 device=device,
-                                 dtype=torch.int32)
+                                 dtype=torch.int64)

    problem_sizes = torch.zeros((num_experts, 3),
                                device=device,

--- a/tests/kernels/quantization/untest_machete_mm.py
+++ b/tests/kernels/quantization/untest_machete_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the machete kernel.

-Run `pytest tests/kernels/test_machete_mm.py`.
+Run `pytest tests/kernels/quantization/test_machete_mm.py`.
 """

 import math
@@ -95,23 +95,23 @@ TEST_TYPES = [
                 token_scale_type=None)
      for w_type in [scalar_types.uint4, scalar_types.uint8]
      for a_type in [torch.float16, torch.bfloat16]),
-    # QQQ style
-    *(TypeConfig(act_type=torch.int8,
-                 weight_type=scalar_types.uint4b8,
-                 output_type=torch.float16,
-                 group_scale_type=group_scale_type,
-                 group_zero_type=None,
-                 channel_scale_type=torch.float,
-                 token_scale_type=torch.float)
-      for group_scale_type in [None, torch.float16]),
-    *(TypeConfig(act_type=torch.float8_e4m3fn,
-                 weight_type=scalar_types.uint4b8,
-                 output_type=torch.float16,
-                 group_scale_type=group_scale_type,
-                 group_zero_type=None,
-                 channel_scale_type=torch.float,
-                 token_scale_type=torch.float)
-      for group_scale_type in [None, torch.float16]),
+    # # QQQ style
+    # *(TypeConfig(act_type=torch.int8,
+    #              weight_type=scalar_types.uint4b8,
+    #              output_type=torch.float16,
+    #              group_scale_type=group_scale_type,
+    #              group_zero_type=None,
+    #              channel_scale_type=torch.float,
+    #              token_scale_type=torch.float)
+    #   for group_scale_type in [None, torch.float16]),
+    # *(TypeConfig(act_type=torch.float8_e4m3fn,
+    #              weight_type=scalar_types.uint4b8,
+    #              output_type=torch.float16,
+    #              group_scale_type=group_scale_type,
+    #              group_zero_type=None,
+    #              channel_scale_type=torch.float,
+    #              token_scale_type=torch.float)
+    #   for group_scale_type in [None, torch.float16]),
 ]

 # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel

--- a/tests/kernels/quantization/untest_marlin_gemm.py
+++ b/tests/kernels/quantization/untest_marlin_gemm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the marlin kernel.

-Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
+Run `pytest tests/kernels/quantization/test_marlin_gemm.py`.
 """
 import pytest
 import torch
@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
-from vllm.model_executor.layers.quantization.qqq import (
-    MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N,
-    MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
    MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
    query_marlin_supported_quant_types)
@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    marlin_weights)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
    marlin_24_quantize)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import (  # noqa: E501
-    marlin_qqq_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
 from vllm.scalar_type import scalar_types
@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
    assert max_diff < 0.04


-@pytest.mark.skipif(not is_quant_method_supported("qqq"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
-@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
-@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS)
-@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES)
-@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_marlin_qqq_gemm(
-    k_chunk,
-    n_chunk,
-    num_bits,
-    group_size,
-    mnk_factors,
-):
-    int8_traits = torch.iinfo(torch.int8)
-    m_factor, n_factor, k_factor = mnk_factors
-
-    size_m = m_factor
-    size_k = k_chunk * k_factor
-    size_n = n_chunk * n_factor
-
-    a_input = rand_data((size_m, size_k))
-    b_weight = rand_data((size_k, size_n))
-
-    # Quantize activations
-    s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to(
-        torch.float)
-    q_a = (a_input / s_a).round().clamp(int8_traits.min,
-                                        int8_traits.max).to(torch.int8)
-
-    # Quantize weights
-    w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \
-    marlin_qqq_quantize(b_weight, num_bits, group_size)
-
-    workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
-                                MARLIN_QQQ_MAX_PARALLEL)
-
-    opcheck(torch.ops._C.marlin_qqq_gemm,
-            (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
-             marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
-             b_weight.shape[1], a_input.shape[1]))
-
-    output = ops.marlin_qqq_gemm(
-        q_a,
-        marlin_qqq_q_w,
-        s_a,
-        marlin_qqq_s_channel,
-        marlin_qqq_s_group,
-        workspace.scratch,
-        a_input.shape[0],
-        b_weight.shape[1],
-        a_input.shape[1],
-    )
-    output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref)
-
-    torch.cuda.synchronize()
-
-    max_diff = compute_max_diff(output, output_ref)
-
-    assert max_diff < 0.04
-
-
 def test_marlin_gemm_subset_input():
    quant_type = scalar_types.uint4b8
    group_size = 128
@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
    max_diff = compute_max_diff(output, output_ref)

    assert max_diff < 0.04
-
-
-def test_marlin_gemm_opcheck():
-    size_m = 2048
-    size_n = 4096
-    size_k = 4096
-    a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16)
-    w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32)
-    s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16)
-    wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
-                         GPTQ_MARLIN_MAX_PARALLEL).scratch
-    x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
-    y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
-    torch.testing.assert_close(x, y)
-    opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k))
\ No newline at end of file
--- a/tests/kernels/quantization/untest_triton_scaled_mm.py
+++ b/tests/kernels/quantization/untest_triton_scaled_mm.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the triton_scaled_mm kernel
+
+Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
+"""
+import os
+import importlib
+from typing import Optional
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+from ...utils import models_path_prefix
+
+device = "cuda"
+
+triton_scaled_mm_module = importlib.import_module(
+    "vllm.model_executor.layers.quantization.compressed_tensors."
+    "triton_scaled_mm")
+triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+
+
+def torch_scaled_mm(a: torch.Tensor,
+                    b: torch.Tensor,
+                    scale_a: torch.Tensor,
+                    scale_b: torch.Tensor,
+                    out_dtype: torch.dtype,
+                    bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Reference scaled matmul computed in float32 with plain PyTorch.
+
+    Args:
+        a: Left operand; converted to float32 before the matmul.
+        b: Right operand; converted to float32 before the matmul.
+        scale_a: Scale multiplied onto the product as given (broadcast).
+        scale_b: Scale multiplied onto the product after transposition.
+        out_dtype: dtype the scaled product is cast to (a `torch.dtype`
+            instance such as `torch.bfloat16`).
+        bias: Optional bias added after the cast to `out_dtype`.
+
+    Returns:
+        The scaled product, plus `bias` when one is given.
+    """
+    out = torch.mm(a.to(torch.float32), b.to(torch.float32))
+    out = scale_a * out
+    out = scale_b.T * out
+    out = out.to(out_dtype)
+    if bias is not None:
+        out = out + bias
+
+    return out
+
+
+def get_8bit_types():
+    """Return the 8-bit input dtypes to test: int8, plus fp8 when supported."""
+    if current_platform.supports_fp8():
+        return [torch.int8, current_platform.fp8_dtype()]
+    return [torch.int8]
+
+
+# Regression check for int8 support on ROCm.
+@pytest.mark.parametrize("model_path", [
+    os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
+])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [10])
+@pytest.mark.skipif(not current_platform.is_rocm(),
+                    reason="Should only run on ROCm")
+def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path,
+                                      max_tokens, num_logprobs):
+    """Load the w8a8 model in bfloat16 and run greedy generation with logprobs.
+
+    The test only checks that generation completes; no outputs are compared.
+    """
+    with vllm_runner(model_path, dtype="bfloat16") as vllm_model:
+        vllm_model.generate_greedy_logprobs(example_prompts, max_tokens,
+                                            num_logprobs)
+
+
+MNK_FACTORS = [
+    (1, 256, 128),
+    (33, 256, 496),
+    (64, 971, 1024),
+    (64, 20486, 128),
+    (512, 256, 496),
+    (512, 20486, 1024),
+]
+
+
+@pytest.mark.parametrize("M,N,K", MNK_FACTORS)
+@pytest.mark.parametrize("out_dtype", [torch.bfloat16])
+@pytest.mark.parametrize("in_dtype", get_8bit_types())
+@pytest.mark.parametrize("use_scalar_scale_a", [True, False])
+@pytest.mark.parametrize("use_scalar_scale_b", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
+def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
+                   use_scalar_scale_b, use_bias):
+    """Compare `triton_scaled_mm` with the PyTorch reference `torch_scaled_mm`.
+
+    Covers scalar `(1, 1)` and per-row `(M, 1)` / `(N, 1)` scales, with and
+    without bias, for every supported 8-bit input dtype.
+    """
+    # Probe whether a dtype is floating point by building a tiny tensor of it.
+    is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t
+                                                    ).is_floating_point()
+
+    current_platform.seed_everything(0)
+
+    # NOTE: There are cases, where if the matrix is large enough, an output
+    # like 65504.4 can be produced, and can easily turn into inf when
+    # multiplied when using float16/bfloat16.  This means one function, e.g.,
+    # testing function, and another function, e.g. golden function, can
+    # produce a non-inf value while the other produces an inf value, and
+    # will cause assert_close/allclose to fail, even though if overflow
+    # wouldn't have occurred, the values would have been "close."
+    #
+    # So, the values here are kept small enough to avoid this situation.
+    if is_floating_point_type(in_dtype):
+        a = (0.25 * torch.rand(
+            (M, K), dtype=torch.float32, device=device)).to(in_dtype)
+        b = (0.25 * torch.rand(
+            (K, N), dtype=torch.float32, device=device)).to(in_dtype)
+    else:
+        a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device)
+        b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device)
+
+    # Scalar scales have shape (1, 1); otherwise one scale per row of a.
+    if use_scalar_scale_a:
+        scale_a = torch.rand((1, 1), device=device)
+    else:
+        scale_a = 0.25 * torch.rand((M, 1), device=device)
+
+    # Scalar scales have shape (1, 1); otherwise one scale per column of b.
+    if use_scalar_scale_b:
+        scale_b = torch.rand((1, 1), device=device)
+    else:
+        scale_b = 0.25 * torch.rand((N, 1), device=device)
+
+    bias = None
+    if use_bias:
+        bias = torch.rand((N, ), device=device, dtype=out_dtype)
+
+    # c_check is the kernel under test; c_actual is the PyTorch reference.
+    c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+    c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+    torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -10,13 +10,19 @@ import pytest
 import torch
 from packaging import version

-from vllm import SamplingParams
+
+from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
+                                      create_standard_kv_cache_spec,
+                                      create_vllm_config)
+from vllm.v1.attention.backends.flex_attention import (
+    FlexAttentionMetadataBuilder)
 from ..utils import models_path_prefix

-from ..models.utils import check_embeddings_close
+from ..models.utils import check_embeddings_close, check_logprobs_close

 TORCH_VERSION = version.parse(torch.__version__)
 MINIMUM_TORCH_VERSION = version.parse("2.7.0")
+DIRECT_BUILD_VERSION = version.parse("2.9.dev0")


 def set_seed(seed):
@@ -36,22 +42,18 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    """Test that FlexAttention produces the same outputs as the default backend.

    This test compares the outputs from the FlexAttention backend with
-    the default backend, ensuring they are identical when using the same seed.
+    the default backend, ensuring they are similar when using the same seed.
    """
    model_name = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
    seed = 42
    max_tokens = 24
+    num_logprobs = 5
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
    ]

-    sampling_params = SamplingParams(temperature=0.0,
-                                     top_p=1.0,
-                                     seed=seed,
-                                     max_tokens=max_tokens)
-
    # Run with flex attention
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
@@ -63,7 +65,8 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
                         tensor_parallel_size=1,
                         num_gpu_blocks_override=128,
                         enforce_eager=True) as llm_flex:
-            output_flex = llm_flex.generate(prompts, sampling_params)
+            output_flex = llm_flex.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)

    # Run with default backend
    with monkeypatch.context() as m:
@@ -73,20 +76,17 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
                         runner="generate",
                         tensor_parallel_size=1,
                         num_gpu_blocks_override=128,
-                         enforce_eager=True) as llm_default:
-            output_default = llm_default.generate(prompts, sampling_params)
-
-    # Compare outputs from both backends
-    for i, (flex_result,
-            default_result) in enumerate(zip(output_flex, output_default)):
-        prompt = prompts[i]
-        flex_text = flex_result[1][0]
-        default_text = default_result[1][0]
-
-        assert flex_text == default_text, (
-            f"FlexAttention output doesn't match default for: {prompt!r}\n"
-            f"FlexAttention: {flex_text!r}\n"
-            f"Default: {default_text!r}")
+                         enforce_eager=True,
+                         gpu_memory_utilization=0.85) as llm_default:
+            output_default = llm_default.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=output_flex,
+        outputs_1_lst=output_default,
+        name_0="flex",
+        name_1="default",
+    )


 @pytest.mark.skipif(
@@ -138,5 +138,70 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
    )


+@pytest.mark.skipif(
+    not torch.cuda.is_available() or TORCH_VERSION < DIRECT_BUILD_VERSION,
+    reason="CUDA not available or PyTorch version < 2.9",
+)
+def test_block_mask_direct_vs_slow_path():
+    """Test that direct path block mask is a superset of slow path.
+
+    The direct path may include extra blocks for performance (over-estimation),
+    but must include all blocks that the slow path determines are necessary.
+    """
+    device = torch.device("cuda")
+
+    vllm_config = create_vllm_config(model_name="meta-llama/Meta-Llama-3-8B",
+                                     block_size=16,
+                                     max_model_len=1024)
+    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
+
+    # Use a mixed batch that will create groups spanning multiple sequences
+    batch_spec = BatchSpec(seq_lens=[35, 64, 128, 256],
+                           query_lens=[33, 5, 32, 64],
+                           name="test_mixed_batch")
+
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec, vllm_config.cache_config.block_size, device)
+
+    builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config,
+                                           device)
+
+    # Build with the builder's initial direct_build setting (presumably
+    # enabled here - TODO confirm), then again with direct_build forced off.
+    metadata_direct = builder.build(common_prefix_len=0,
+                                    common_attn_metadata=common_attn_metadata)
+    builder.direct_build = False
+    metadata_slow = builder.build(common_prefix_len=0,
+                                  common_attn_metadata=common_attn_metadata)
+
+    assert metadata_direct.block_mask is not None
+    assert metadata_slow.block_mask is not None
+
+    # Extract block indices for comparison, B, H are the same
+    direct_indices = metadata_direct.block_mask.kv_indices[0, 0]
+    slow_indices = metadata_slow.block_mask.kv_indices[0, 0]
+    direct_num = metadata_direct.block_mask.kv_num_blocks[0, 0]
+    slow_num = metadata_slow.block_mask.kv_num_blocks[0, 0]
+
+    # main test: every block needed by slow path must be in direct path
+    num_groups = direct_num.shape[0]
+    missing_details = []
+
+    for group_idx in range(num_groups):
+        direct_blocks = set(
+            direct_indices[group_idx, :direct_num[group_idx]].tolist())
+        slow_blocks = set(
+            slow_indices[group_idx, :slow_num[group_idx]].tolist())
+
+        missing_blocks = slow_blocks - direct_blocks
+        if missing_blocks:
+            missing_details.append(
+                f"Group {group_idx}: missing {sorted(missing_blocks)}")
+
+    assert not missing_details, (
+        "Direct path is missing blocks required by slow path:\n" +
+        "\n".join(missing_details))
+
+
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the oneDNN int8 scaled matrix-multiplication CPU kernels"""
+
+from typing import Optional
+
+import pytest
+import torch
+
+from tests.kernels.utils import to_int8
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+
+# Everything below targets CPU-only kernels, so skip the module elsewhere.
+if not current_platform.is_cpu():
+    pytest.skip("skipping CPU-only tests", allow_module_level=True)
+
+# (n, k) pairs consumed by the "n,k" parametrization of the GEMM test.
+NK_FACTORS = [
+    (256, 128),
+    (4096, 4096),
+    (16384, 4096),
+    (1023, 491),
+    (1001, 15),
+]
+# Each tuple is a sequence of m values; the GEMM test loops over one tuple
+# per test case.
+M_FACTORS = [
+    (16, 1, 32, 128, 64),
+    (1, 17, 1, 31, 17),
+]
+# Values used for the primitive_cache_size test parameter.
+CACHE_SIZES = [2]
+# Output dtypes used for the output_type test parameter.
+DTYPE = [torch.bfloat16]
+
+
+def rand_int8(shape: tuple, device: str = "cpu"):
+    """Draw uniform samples, map them to [-128, 127) and pass to to_int8."""
+    uniform = torch.rand(shape, device=device)
+    return to_int8(uniform * 255 - 128)
+
+
+def ref_int8_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    azp: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    output_type: torch.dtype,
+):
+    """Float32 reference for the scaled matmul, cast to ``output_type``.
+
+    Computes ``(scale_a * (a - azp)) @ (scale_b * b) + bias`` in float32,
+    where the ``azp`` subtraction and the ``bias`` addition are skipped when
+    the corresponding argument is None.
+    """
+    lhs = a.to(dtype=torch.float32)
+    if azp is not None:
+        # Remove the activation zero point before applying the scale.
+        lhs = lhs - azp.to(dtype=torch.float32)
+    rhs = b.to(dtype=torch.float32)
+
+    output = torch.mm(scale_a * lhs, scale_b * rhs)
+    if bias is not None:
+        output += bias.float()
+
+    return output.to(dtype=output_type)
+
+
+def onednn_int8_gemm_test_helper(primitive_cache_size: int,
+                                 m: int,
+                                 n: int,
+                                 k: int,
+                                 per_tensor_a_quant: bool,
+                                 per_tensor_b_quant: bool,
+                                 use_azp: bool,
+                                 use_bias: bool,
+                                 out_dtype: torch.dtype = torch.bfloat16,
+                                 device: str = "cpu"):
+    """Run one oneDNN int8 scaled-mm case and compare it to the reference.
+
+    Test for a oneDNN kernel with per-tensor / per-token activation
+    quantization and per-tensor / per-output channel weight quantization.
+
+    Args:
+        primitive_cache_size: Forwarded to ``ops.create_onednn_scaled_mm``.
+        m: Rows of the activation ``a`` and of the output.
+        n: Columns of the weight ``b`` and of the output.
+        k: Shared inner dimension of ``a`` (m, k) and ``b`` (k, n).
+        per_tensor_a_quant: Use a (1, 1) activation scale instead of (m, 1).
+        per_tensor_b_quant: Use a (1, 1) weight scale instead of (1, n).
+        use_azp: Also pass an activation zero point and its adjustment term.
+        use_bias: Also pass a bias of shape (n,); the kernel is then run a
+            second time without the bias on the same handler.
+        out_dtype: dtype of the kernel output and of the reference result.
+        device: Device on which every test tensor is allocated.
+
+    Raises:
+        AssertionError: If the kernel output is not close to the reference.
+    """
+    a = to_int8(torch.randn((m, k), device=device) * 5)
+    # The weight is generated as (n, k) and transposed, so b is (k, n).
+    b = to_int8(torch.randn((n, k), device=device).t() * 5)
+
+    a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1)
+    b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n)
+
+    scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32)
+    scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32)
+
+    if use_azp:
+        azp = torch.rand(
+            a_scales_shape, device=device, dtype=torch.float32) * 10 + 1.5
+        azp = (azp / scale_a).round().to(dtype=torch.int32)
+        azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32)
+    else:
+        azp = None
+        azp_adj = None
+
+    if use_bias:
+        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
+    else:
+        bias = None
+
+    handler = ops.create_onednn_scaled_mm(
+        b,
+        scale_b,
+        out_dtype,
+        not per_tensor_a_quant,
+        use_azp,
+        primitive_cache_size,
+    )
+
+    def run_and_check(bias_arg: Optional[torch.Tensor]) -> None:
+        # Run the kernel into a fresh output buffer and compare it with the
+        # float32 reference computed from the same inputs.
+        out = torch.zeros((m, n), dtype=out_dtype, device=device)
+        ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj,
+                             bias_arg)
+        baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias_arg,
+                                      out_dtype)
+        torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
+
+    run_and_check(bias)
+
+    if use_bias:
+        # To test runtime bias setting
+        run_and_check(None)
+
+
+@pytest.mark.parametrize("n,k", NK_FACTORS)
+@pytest.mark.parametrize("m_list", M_FACTORS)
+@pytest.mark.parametrize("per_tensor_a_scale", [True, False])
+@pytest.mark.parametrize("per_tensor_b_scale", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("use_azp", [True, False])
+@pytest.mark.parametrize("output_type", DTYPE)
+@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
+def test_onednn_int8_scaled_gemm(
+    n: int,
+    k: int,
+    m_list: tuple[int],
+    per_tensor_a_scale: bool,
+    per_tensor_b_scale: bool,
+    use_bias: bool,
+    use_azp: bool,
+    output_type: torch.dtype,
+    primitive_cache_size: int,
+):
+    """Run the int8 GEMM helper once for every m in ``m_list``."""
+    # Everything except m is identical across the runs of one test case.
+    fixed_args = {
+        "primitive_cache_size": primitive_cache_size,
+        "n": n,
+        "k": k,
+        "per_tensor_a_quant": per_tensor_a_scale,
+        "per_tensor_b_quant": per_tensor_b_scale,
+        "use_bias": use_bias,
+        "use_azp": use_azp,
+        "out_dtype": output_type,
+    }
+    for rows in m_list:
+        onednn_int8_gemm_test_helper(m=rows, **fixed_args)
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -3,7 +3,7 @@

 import tempfile
 from collections import OrderedDict
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock

 import pytest
 import os
@@ -11,8 +11,6 @@ import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download

-import vllm
-from vllm.config import LoRAConfig
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
@@ -22,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models.interfaces import SupportsLoRA
 from vllm.platforms import current_platform
 from ..utils import models_path_prefix
@@ -106,6 +103,7 @@ def dummy_model() -> nn.Module:
        ]))
    model.config = MagicMock()
    model.embedding_modules = {"lm_head": "lm_head"}
+    model.unpadded_vocab_size = 32000
    return model


@@ -139,6 +137,8 @@ def dummy_model_gate_up() -> nn.Module:
        ],
    }
    model.embedding_modules = {"lm_head": "lm_head"}
+    model.unpadded_vocab_size = 32000
+
    return model


@@ -223,40 +223,6 @@ def tinyllama_lora_files():
    return os.path.join(models_path_prefix, "jashing/tinyllama-colorist-lora")


-@pytest.fixture(scope="session")
-def phi2_lora_files():
-    # return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
-    return os.path.join(models_path_prefix, "isotr0py/phi-2-test-sql-lora")
-
-@pytest.fixture(scope="session")
-def qwen_lora_files():
-    # return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
-    return os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora")
-
-
-@pytest.fixture
-def llama_2_7b_engine_extra_embeddings():
-    cleanup_dist_env_and_memory(shutdown_ray=True)
-    get_model_old = get_model
-
-    def get_model_patched(**kwargs):
-        kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
-                                                       max_lora_rank=8)
-        return get_model_old(**kwargs)
-
-    with patch("vllm.worker.model_runner.get_model", get_model_patched):
-        engine = vllm.LLM(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), enable_lora=False)
-    yield engine.llm_engine
-    del engine
-    cleanup_dist_env_and_memory(shutdown_ray=True)
-
-
-@pytest.fixture
-def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
-    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
-           model_runner.model)
-
-
 @pytest.fixture
 def reset_default_device():
    """

--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -6,7 +6,6 @@ import time
 import os
 import pytest

-import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
@@ -100,12 +99,10 @@ async def test_add_lora(chatglm3_lora_files):
        # Run with warmup
        add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
        add_lora_results = await asyncio.gather(*add_lora_tasks)
-        if env.VLLM_USE_V1:
-            # Test that all all_lora calls are successful.
-            assert all(add_lora_results)
-        else:
-            # No way to check V0 engine results as the calls just return None.
-            pass
+
+        # Test that all add_lora calls are successful.
+        assert all(add_lora_results)
+
        time_with_add_lora = await requests_processing_time(
            llm, warmup_run_requests)


--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -89,6 +89,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
+    # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
+    # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
+    # more GPU memory causing vLLM to OOM
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
@@ -97,7 +100,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
                   tensor_parallel_size=4,
                   trust_remote_code=True,
                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True)
+                   enable_chunked_prefill=True,
+                   gpu_memory_utilization=0.85)
    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
        assert output1[i] == EXPECTED_LORA_OUTPUT[i]

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 @torch.inference_mode()
 # @pytest.mark.skip(
 #     reason="Fails when loras are in any slot other than the first.")
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
-def test_linear_replicated(dist_init, num_loras, device, stage,
-                           bias_enabled) -> None:
+def test_linear_replicated(
+    dist_init,
+    num_loras,
+    device,
+    stage,
+) -> None:

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             lora_dtype=torch.float16,
-                             bias_enabled=bias_enabled)
+    lora_config = LoRAConfig(
+        max_loras=max_loras,
+        max_lora_rank=8,
+        lora_dtype=torch.float16,
+    )

    def create_random_linear_replicated_layer():

@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
        lora_linear.create_lora_weights(max_loras, lora_config)
        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
            lora_linear.lora_b_stacked) == 1)
-        if bias_enabled:
-            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
-        else:
-            assert lora_linear.lora_bias_stacked is None
        return linear, lora_linear

    for i in range(NUM_RANDOM_SEEDS):
@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("orientation", ["row", "column"])
 @pytest.mark.parametrize("fully_shard", [True, False])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
 def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
-                         device, stage, bias_enabled) -> None:
+                         device, stage) -> None:

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             fully_sharded_loras=fully_shard,
-                             lora_dtype=torch.float16,
-                             bias_enabled=bias_enabled)
+    lora_config = LoRAConfig(
+        max_loras=max_loras,
+        max_lora_rank=8,
+        fully_sharded_loras=fully_shard,
+        lora_dtype=torch.float16,
+    )

    def create_random_linear_parallel_layer():
        if orientation == "row":
@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
        lora_linear.create_lora_weights(max_loras, lora_config)
        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
            lora_linear.lora_b_stacked) == 1)
-        if bias_enabled:
-            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
-        else:
-            assert lora_linear.lora_bias_stacked is None
+
        return linear, lora_linear

    for i in range(NUM_RANDOM_SEEDS):
@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,


 @torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
 @pytest.mark.parametrize("repeats", [1, 2, 3])
 @pytest.mark.parametrize("fully_shard", [True, False])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
 def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
-                                device, stage, bias_enabled) -> None:
+                                device, stage) -> None:

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             fully_sharded_loras=fully_shard,
-                             lora_dtype=torch.float16,
-                             bias_enabled=bias_enabled)
+    lora_config = LoRAConfig(
+        max_loras=max_loras,
+        max_lora_rank=8,
+        fully_sharded_loras=fully_shard,
+        lora_dtype=torch.float16,
+    )

    def create_column_parallel_packed_layer():
        if repeats == 2:
@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                        model_config=FakeConfig())
        assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
            lora_linear.lora_b_stacked) == n_slices)
-        if bias_enabled:
-            assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
-        else:
-            assert lora_linear.lora_bias_stacked is None
+
        return linear, lora_linear

    for i in range(NUM_RANDOM_SEEDS):

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -114,8 +114,7 @@ def test_llama_lora(sql_lora_files):
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=13,
-        max_loras=4,
-        enable_chunked_prefill=True)
+        max_loras=4)
    generate_and_test(llm, sql_lora_files)


@@ -129,7 +128,6 @@ def test_llama_lora_tp4(sql_lora_files):
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=4,
-        enable_chunked_prefill=True,
    )
    generate_and_test(llm, sql_lora_files)

@@ -145,7 +143,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
        max_loras=4,
        tensor_parallel_size=4,
        fully_sharded_loras=True,
-        enable_chunked_prefill=True,
    )
    generate_and_test(llm, sql_lora_files)


--- a/tests/lora/test_multi_loras_with_tp.py
+++ b/tests/lora/test_multi_loras_with_tp.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Script to test multi loras service with tp >= 2
+This script contains:
+1. test multi loras service with tp >= 2
+2. test multi loras request
 """
+import pytest
+
 from tests.utils import multi_gpu_test
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():

        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(output_text, expected_output)
+
+
+def test_multiple_lora_requests():
+    """Check how ``LLM.generate`` pairs LoRA requests with prompts."""
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    PROMPTS = ["Hello, my name is"] * 2
+    LORA_NAME = "Alice"
+    lora_request = [
+        LoRARequest(LORA_NAME + str(idx), idx + 1,
+                    LORA_NAME_PATH_MAP[LORA_NAME])
+        for idx in range(len(PROMPTS))
+    ]
+    # A list of LoRARequests should be matched with each prompt
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+
+    # ValueError is expected when the list is shorter than the prompt list
+    with pytest.raises(ValueError):
+        llm.generate(PROMPTS, lora_request=lora_request[:1])
+
+    # Single LoRARequest should be applied to every prompt
+    outputs = llm.generate(PROMPTS, lora_request=lora_request[0])
+    assert len(PROMPTS) == len(outputs)