Merge tag 'v0.8.3' into v0.8.3-dev

fcfc474d · zhuwenwen · bb94d2e5 · 296c6572 · fcfc474d · fcfc474d
Commit fcfc474d authored Apr 09, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -752,3 +752,72 @@ def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
    ops.gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size)
    torch.testing.assert_close(dst, expected)
+@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
+@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+@torch.inference_mode()
+def test_concat_and_cache_mla_cpu(
+    kv_lora_rank: int,
+    qk_rope_head_dim: int,
+    num_tokens: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+) -> None:
+    """Check ``ops.concat_and_cache_mla`` on CPU against a PyTorch reference.
+
+    Random ``kv_c`` and ``k_pe`` rows are written to distinct, randomly
+    chosen cache slots. The reference cache is filled token by token in
+    plain PyTorch and compared with the cache produced by the op.
+    """
+    device = "cpu"
+    kv_cache_dtype = "auto"
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+    # random.sample draws without replacement, so every token gets its own
+    # slot (this requires num_tokens <= num_blocks * block_size).
+    total_slots = num_blocks * block_size
+    slot_mapping_lst = random.sample(range(total_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)
+    kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
+    k_pe = torch.randn(num_tokens,
+                       qk_rope_head_dim,
+                       dtype=dtype,
+                       device=device)
+    # One cache entry holds kv_c followed by k_pe (see the reference fill).
+    entry_size = kv_lora_rank + qk_rope_head_dim
+    scale = torch.tensor(0.1, dtype=torch.float32, device=device)
+    kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
+                                 kv_cache_dtype, device)
+    # Build the expected cache: slots that receive no token stay zero.
+    ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
+    for i in range(num_tokens):
+        slot = slot_mapping[i].item()
+        # A flat slot index decomposes into (block, offset inside block).
+        block_idx = slot // block_size
+        block_offset = slot % block_size
+        ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i]
+        ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i]
+    # kv_cache_dtype is fixed to "auto" above, so the fp8 branch below is
+    # not taken by this test.
+    if kv_cache_dtype == "fp8":
+        ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
+        ops.convert_fp8(ref_kv_cache,
+                        ref_temp,
+                        scale.item(),
+                        kv_dtype=kv_cache_dtype)
+    else:
+        ref_kv_cache = ref_temp
+    # Run opcheck on the registered op first, then the Python wrapper whose
+    # result is compared with the reference.
+    opcheck(
+        torch.ops._C_cache_ops.concat_and_cache_mla,
+        (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+    )
+    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
+                             kv_cache_dtype, scale)
+    torch.testing.assert_close(kv_cache, ref_kv_cache)
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -3,6 +3,7 @@
 Run `pytest tests/kernels/test_cutlass.py`.
 """
+import random
 import pytest
 import torch
@@ -499,3 +500,140 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
                                     torch.float16)
    #print("out:",out)
    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
+# def test_cutlass_support_opcheck():
+#     opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, ))
+@pytest.mark.parametrize("num_experts", [8, 64])
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("use_bias", [False])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
+                                per_out_ch: bool, use_bias: bool):
+    """Check ``ops.cutlass_moe_mm`` against per-expert ``baseline_scaled_mm``.
+
+    Each expert gets its own random FP8 ``A`` (m_g x k_g) and ``B``
+    (k_g x n_g) problem; ``n_g`` and ``k_g`` are shared by all experts while
+    ``m_g`` varies. The per-expert inputs are stacked into the layout passed
+    to ``cutlass_moe_mm`` and every output slice is compared with the
+    baseline computed for that expert.
+
+    The definition header must stay uncommented together with the body:
+    commenting out only the decorators and the ``def`` line leaves an
+    orphaned indented body and the module no longer parses. Hardware
+    without grouped-GEMM support is handled by the ``skipif`` above.
+
+    ``use_bias`` is parametrized but not read by the body.
+    """
+    # Device and dtype setup
+    device = "cuda"
+    out_dtype = torch.half
+    # Create separate A, B, C tensors for each group
+    a_tensors = []
+    b_tensors = []
+    a_scales_tensors = []
+    b_scales_tensors = []
+    baseline_tensors = []
+    # expert_offsets[g] is the first row of expert g in the stacked tensors;
+    # expert_offsets[num_experts] is the total row count.
+    expert_offsets = torch.zeros((num_experts + 1),
+                                 device=device,
+                                 dtype=torch.int32)
+    # One (m, n, k) triple per expert.
+    problem_sizes = torch.zeros((num_experts, 3),
+                                device=device,
+                                dtype=torch.int32)
+    if not per_act_token:
+        # A single activation scale shared by every expert.
+        one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32)
+    alignment = 16  # 128 // 8
+    # For variation, each group has dimensions
+    n_g = alignment * random.randint(1, 64)
+    k_g = alignment * random.randint(1, 64)
+    # n_g is the same for every expert, so the B-scale width is too.
+    n_b_scales = n_g if per_out_ch else 1
+    for g in range(num_experts):
+        m_g = alignment * random.randint(1, 64)
+        expert_offsets[g + 1] = expert_offsets[g] + m_g
+        problem_sizes[g][0] = m_g
+        problem_sizes[g][1] = n_g
+        problem_sizes[g][2] = k_g
+        m_a_scales = m_g if per_act_token else 1
+        # Create group-specific A and B (FP8) and output (FP16/FP32)
+        a_g = to_fp8(torch.randn((m_g, k_g), device=device))
+        b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
+        a_tensors.append(a_g)
+        b_tensors.append(b_g)
+        # Set up A/B scales
+        scale_b = torch.randn((1, n_b_scales),
+                              device=device,
+                              dtype=torch.float32)
+        b_scales_tensors.append(scale_b)
+        if per_act_token:
+            scale_a = torch.randn((m_a_scales, 1),
+                                  device=device,
+                                  dtype=torch.float32)
+            a_scales_tensors.append(scale_a)
+        else:
+            scale_a = one_scale_a
+        # Compute baseline result for this group
+        baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype,
+                                        None)
+        baseline_tensors.append(baseline_g)
+    # Stack the per-expert inputs: A rows are concatenated, B is stored as
+    # (num_experts, n_g, k_g) and then viewed as (num_experts, k_g, n_g).
+    a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g),
+                                    device=device,
+                                    dtype=torch.float8_e4m3fn)
+    b_tensors_stacked = torch.empty((num_experts, n_g, k_g),
+                                    device=device,
+                                    dtype=torch.float8_e4m3fn)
+    for g in range(num_experts):
+        a_tensors_stacked[expert_offsets[g]:expert_offsets[g +
+                                                           1]] = a_tensors[g]
+        b_tensors_stacked[g] = b_tensors[g].t()
+    b_tensors_stacked = b_tensors_stacked.transpose(1, 2)
+    if per_act_token:
+        a_scales_tensors_stacked = torch.empty(
+            (expert_offsets[num_experts], 1),
+            device=device,
+            dtype=torch.float32)
+        for g in range(num_experts):
+            a_scales_tensors_stacked[
+                expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g]
+    else:
+        a_scales_tensors_stacked = one_scale_a
+    b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales),
+                                           device=device,
+                                           dtype=torch.float32)
+    for g in range(num_experts):
+        b_scales_tensors_stacked[g] = b_scales_tensors[g]
+    out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g),
+                                      device=device,
+                                      dtype=out_dtype)
+    # Row strides of the stacked A and output tensors, repeated per expert.
+    ab_strides = torch.full((num_experts, ),
+                            a_tensors_stacked.stride(0),
+                            device="cuda",
+                            dtype=torch.int64)
+    c_strides = torch.full((num_experts, ),
+                           out_tensors_stacked.stride(0),
+                           device="cuda",
+                           dtype=torch.int64)
+    ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
+                       b_tensors_stacked, a_scales_tensors_stacked,
+                       b_scales_tensors_stacked, expert_offsets[:-1],
+                       problem_sizes, ab_strides, ab_strides, c_strides)
+    # Validate each group's result against the baseline
+    for g in range(num_experts):
+        baseline = baseline_tensors[g]
+        c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
+        torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)
--- a/tests/kernels/test_cutlass_moe.py
+++ b/tests/kernels/test_cutlass_moe.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from vllm import _custom_ops as ops
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
+                                                            fused_topk)
+from vllm.platforms import current_platform
+NUM_EXPERTS = [40, 64]
+TOP_KS = [6, 8]
+def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
+        w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
+        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
+    """Call ``cutlass_moe_fp8`` inside its own vLLM config context.
+
+    The arguments are forwarded unchanged; ``a_scale`` is passed as the
+    ``a1_scale`` keyword. A ``VllmConfig`` with ``pipeline_parallel_size=1``
+    is installed as the current config for the duration of the call.
+
+    Returns:
+        Whatever ``cutlass_moe_fp8`` returns.
+    """
+    with set_current_vllm_config(
+            VllmConfig(parallel_config=ParallelConfig(
+                pipeline_parallel_size=1))):
+        return cutlass_moe_fp8(a,
+                               w1_q,
+                               w2_q,
+                               w1_scale,
+                               w2_scale,
+                               topk_weights,
+                               topk_ids,
+                               ab_strides1,
+                               c_strides1,
+                               ab_strides2,
+                               c_strides2,
+                               a1_scale=a_scale)
+@pytest.mark.parametrize("m", [2, 64, 224])
+@pytest.mark.parametrize("n", [1024, 3072])
+@pytest.mark.parametrize("k", [1024, 1536])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_cutlass_moe_no_graph(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
+    """Compare ``cutlass_moe_fp8`` with ``fused_experts`` (eager execution).
+
+    Activations and expert weights are quantized with
+    ``ops.scaled_fp8_quant``. The CUTLASS path receives the quantized
+    weights and scales; the reference path receives the same values
+    dequantized back to half precision. The two outputs must agree within
+    ``atol=5e-2`` / ``rtol=1e-2``.
+    """
+    current_platform.seed_everything(7)
+    with set_current_vllm_config(
+            VllmConfig(parallel_config=ParallelConfig(
+                pipeline_parallel_size=1))):
+        dtype = torch.half
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+        # Get the right scale for tests.
+        # The first call is only used for the scale it returns; the second
+        # call quantizes with that scale passed in explicitly.
+        _, a_scale1 = ops.scaled_fp8_quant(
+            a, use_per_token_if_dynamic=per_act_token)
+        a_q, _ = ops.scaled_fp8_quant(a,
+                                      a_scale1,
+                                      use_per_token_if_dynamic=per_act_token)
+        # Dequantized activations, used as input to the reference path.
+        a_d = a_q.float().mul(a_scale1).to(dtype)
+        # Number of weight scales per expert: one per row when per_out_ch,
+        # otherwise a single scale.
+        n_b_scales = 2 * n if per_out_ch else 1
+        k_b_scales = k if per_out_ch else 1
+        w1_q = torch.empty((e, 2 * n, k),
+                           device="cuda",
+                           dtype=torch.float8_e4m3fn)
+        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
+        w1_scale = torch.empty((e, n_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+        w2_scale = torch.empty((e, k_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
+        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
+        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        # Quantize every expert's weights independently.
+        for expert in range(e):
+            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+                w1[expert], use_per_token_if_dynamic=per_out_ch)
+            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+                w2[expert], use_per_token_if_dynamic=per_out_ch)
+        # Swap the last two dims (a view; the underlying storage is kept).
+        w1_q = w1_q.transpose(1, 2)
+        w2_q = w2_q.transpose(1, 2)
+        # NOTE(review): these four tensors repeat the values already built
+        # before the quantization loop above.
+        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
+        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
+        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        # Dequantize the weights for the reference path; .t() undoes the
+        # transpose applied above so the result matches w1 / w2 in shape.
+        w1_d = torch.empty_like(w1)
+        w2_d = torch.empty_like(w2)
+        for expert in range(e):
+            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
+            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+        # Reference: fused_experts on the dequantized inputs.
+        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)
+        # Under test: CUTLASS path on the original activations plus the
+        # quantized weights and their scales.
+        cutlass_output = cutlass_moe_fp8(a,
+                                         w1_q,
+                                         w2_q,
+                                         w1_scale,
+                                         w2_scale,
+                                         topk_weights,
+                                         topk_ids,
+                                         ab_strides1,
+                                         c_strides1,
+                                         ab_strides2,
+                                         c_strides2,
+                                         a1_scale=a_scale1)
+        #print(triton_output)
+        #print(cutlass_output)
+        #print("*")
+        torch.testing.assert_close(triton_output,
+                                   cutlass_output,
+                                   atol=5e-2,
+                                   rtol=1e-2)
+@pytest.mark.parametrize("m", [2, 64, 224])
+@pytest.mark.parametrize("n", [1024, 3072])
+@pytest.mark.parametrize("k", [1024, 1536])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_cutlass_moe_cuda_graph(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_ch: bool,
+):
+    """Compare ``cutlass_moe_fp8`` with ``fused_experts`` under a CUDA graph.
+
+    The inputs are prepared as in the eager test. The CUTLASS call is
+    captured into a ``torch.cuda.CUDAGraph`` on a dedicated stream and
+    replayed once, and the replayed output is compared with the reference
+    within ``atol=9e-2`` / ``rtol=1e-2``.
+    """
+    current_platform.seed_everything(7)
+    with set_current_vllm_config(
+            VllmConfig(parallel_config=ParallelConfig(
+                pipeline_parallel_size=1))):
+        dtype = torch.half
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+        # Get the right scale for tests.
+        # The first call is only used for the scale it returns; the second
+        # call quantizes with that scale passed in explicitly.
+        _, a_scale1 = ops.scaled_fp8_quant(
+            a, use_per_token_if_dynamic=per_act_token)
+        a_q, _ = ops.scaled_fp8_quant(a,
+                                      a_scale1,
+                                      use_per_token_if_dynamic=per_act_token)
+        # Dequantized activations, used as input to the reference path.
+        a_d = a_q.float().mul(a_scale1).to(dtype)
+        # Number of weight scales per expert: one per row when per_out_ch,
+        # otherwise a single scale.
+        n_b_scales = 2 * n if per_out_ch else 1
+        k_b_scales = k if per_out_ch else 1
+        w1_q = torch.empty((e, 2 * n, k),
+                           device="cuda",
+                           dtype=torch.float8_e4m3fn)
+        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
+        w1_scale = torch.empty((e, n_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+        w2_scale = torch.empty((e, k_b_scales, 1),
+                               device="cuda",
+                               dtype=torch.float32)
+        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
+        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
+        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        # Quantize every expert's weights independently.
+        for expert in range(e):
+            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+                w1[expert], use_per_token_if_dynamic=per_out_ch)
+            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+                w2[expert], use_per_token_if_dynamic=per_out_ch)
+        # Swap the last two dims (a view; the underlying storage is kept).
+        w1_q = w1_q.transpose(1, 2)
+        w2_q = w2_q.transpose(1, 2)
+        # NOTE(review): these four tensors repeat the values already built
+        # before the quantization loop above.
+        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
+        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
+        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
+        # Dequantize the weights for the reference path; .t() undoes the
+        # transpose applied above so the result matches w1 / w2 in shape.
+        w1_d = torch.empty_like(w1)
+        w2_d = torch.empty_like(w2)
+        for expert in range(e):
+            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
+            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
+        # Reference: fused_experts on the dequantized inputs.
+        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)
+        # Capture the CUTLASS call on a dedicated stream, then replay it;
+        # cutlass_output is the tensor returned during capture and is read
+        # after the replay has been synchronized.
+        stream = torch.cuda.Stream()
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph, stream=stream):
+            cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale,
+                                 topk_weights, topk_ids, ab_strides1,
+                                 c_strides1, ab_strides2, c_strides2)
+        torch.cuda.synchronize()
+        graph.replay()
+        torch.cuda.synchronize()
+        #print(triton_output)
+        #print(cutlass_output)
+        #print("*")
+        torch.testing.assert_close(triton_output,
+                                   cutlass_output,
+                                   atol=9e-2,
+                                   rtol=1e-2)
--- a/tests/kernels/test_gguf.py
+++ b/tests/kernels/test_gguf.py
@@ -69,7 +69,7 @@ QUANT_TYPES = [
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", [torch.half])
+@pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("quant_type", QUANT_TYPES)
 @torch.inference_mode()
 def test_dequantize(hidden_size: int, dtype: torch.dtype,
@@ -82,7 +82,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
        ref_output = torch.tensor(dequantize(tensor.data, quant_type),
                                  device="cuda").to(dtype)
        output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),
-                                     quant_type, *list(shape)).to(dtype)
+                                     quant_type, *list(shape), dtype)
        torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)

--- a/tests/kernels/test_lightning_attn.py
+++ b/tests/kernels/test_lightning_attn.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from vllm.model_executor.layers.lightning_attn import (
+    linear_decode_forward_triton)
+from vllm.platforms import current_platform
+NUM_HEADS = [4, 8]
+HEAD_SIZES = [64]
+BATCH_SIZES = [1, 2]
+SEQ_LENGTHS = [16]
+DTYPES = [torch.float32]
+def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
+    """Reference implementation of lightning attention core algorithm
+
+    The difference from the main implementation is that this processes
+    each step sequentially, instead of using parallelized triton kernels.
+
+    Args:
+        q: queries, unpacked as [B, H, S, D].
+        k: keys, indexed the same way as ``q``.
+        v: values, indexed as [B, H, S, E] with E = ``v.shape[-1]``.
+        ed: decay exponents; ``exp(-ed)`` is the per-head decay multiplier.
+        block_size: not read by this function.
+        kv_history: initial KV state or ``None`` (then zeros of shape
+            [B, H, D, E] are used). It is cloned, so the caller's tensor
+            is left untouched.
+
+    Returns:
+        ``(output, final_kv_cache)`` where ``output`` is [B, H, S, E] and
+        ``final_kv_cache`` is the final KV state repeated twice along a
+        new dim 2, i.e. [B, H, 2, D, E].
+    """
+    B, H, S, D = q.shape
+    E = v.shape[-1]
+    dtype = q.dtype
+    output = torch.zeros((B, H, S, E), dtype=dtype, device=q.device)
+    # Use clone() to ensure an independent copy
+    if kv_history is None:
+        kv_cache = torch.zeros((B, H, D, E), dtype=dtype, device=q.device)
+    else:
+        kv_cache = kv_history.clone()
+    # Turn the decay exponents into multipliers. A 1-D ``ed`` is reshaped
+    # to [1, H, 1, 1] so that decay[0, h, 0, 0] below selects head h.
+    # NOTE(review): a non-1-D ``ed`` is used as-is and must already be
+    # indexable as decay[0, h, 0, 0] -- confirm the expected shape.
+    if ed.dim() == 1:
+        decay = torch.exp(-ed).view(1, -1, 1, 1)
+    else:
+        decay = torch.exp(-ed)
+    for b in range(B):
+        for step in range(S):
+            # Process all heads at once for this position
+            q_bs = q[b, :, step]  # [H, D]
+            k_bs = k[b, :, step]  # [H, D]
+            v_bs = v[b, :, step]  # [H, E]
+            # Calculate KV outer products for all heads
+            for h in range(H):
+                # Calculate KV outer product
+                kv_outer = torch.outer(k_bs[h], v_bs[h])
+                # Update KV cache with decay
+                # Note: Using the same order as in the Triton kernel
+                kv_cache[b, h] = decay[0, h, 0, 0] * kv_cache[b, h] + kv_outer
+                # Calculate attention output
+                output[b, h, step] = torch.matmul(q_bs[h], kv_cache[b, h])
+    # Match the shape returned by the actual implementation
+    # The actual implementation returns a tensor of shape [B, H, 2, D, E]
+    # where dimension 2 contains both KV and KV history
+    kv_reshaped = kv_cache.unsqueeze(2)  # [B, H, 1, D, E]
+    final_kv_cache = torch.cat([kv_reshaped, kv_reshaped],
+                               dim=2)  # [B, H, 2, D, E]
+    return output, final_kv_cache
+def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
+    """Reference implementation: linear attention decode function
+
+    Performs one decode step per batch entry: the KV state is decayed,
+    the outer product of the current key and value is added, and the
+    query is multiplied by the updated state.
+
+    Args:
+        q, k, v: unpacked / indexed as [B, H, 1, D]; only position 0 of
+            dim 2 is read.
+        kv_caches: KV state indexed as ``kv_caches[b, h]``. It is updated
+            IN PLACE for every non-padding batch entry.
+        slope_rate: per-head decay exponents; ``exp(-slope_rate)`` is used.
+        slot_idx: per-batch slot ids; entries equal to -1 are treated as
+            padding and skipped. The id is only tested against -1 here --
+            the cache is indexed by the batch position ``b``.
+
+    Returns:
+        Tensor of shape [B, H * D]; rows of skipped entries stay zero.
+    """
+    B, H, _, D = q.shape
+    output = torch.zeros(B, H * D, dtype=q.dtype, device=q.device)
+    # Calculate decay factors once (more efficient)
+    decay = torch.exp(-slope_rate).view(-1, 1, 1)  # [H, 1, 1]
+    # Process each batch
+    for b in range(B):
+        slot_id = slot_idx[b].item()
+        # Skip padding positions
+        if slot_id == -1:
+            continue
+        # Process all heads at once for this batch
+        q_b = q[b, :, 0]  # [H, D]
+        k_b = k[b, :, 0]  # [H, D]
+        v_b = v[b, :, 0]  # [H, D]
+        # Process each attention head
+        for h in range(H):
+            # Get current query, key and value
+            q_bh = q_b[h]
+            k_bh = k_b[h]
+            v_bh = v_b[h]
+            # Get cache
+            kv_cache_old = kv_caches[b, h]
+            # Calculate new key-value outer product
+            kv_outer = torch.outer(k_bh, v_bh)
+            # Apply decay and update cache
+            kv_new = kv_outer + decay[h, 0, 0] * kv_cache_old
+            # Calculate output
+            out_h = torch.matmul(q_bh, kv_new)
+            # Update output and cache
+            output[b, h * D:(h + 1) * D] = out_h
+            kv_caches[b, h] = kv_new
+    return output
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_linear_decode_forward_triton(
+    batch_size: int,
+    num_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+):
+    """Compare ``linear_decode_forward_triton`` with the Python reference.
+
+    Both implementations start from identical KV caches (the reference
+    gets a clone). Their outputs and their final caches are compared.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+    current_platform.seed_everything(42)
+    # Inputs are scaled down by ``base`` to keep magnitudes small.
+    base = 0.01
+    q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
+    k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
+    v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
+    kv_caches = base * torch.randn(batch_size,
+                                   num_heads,
+                                   head_size,
+                                   head_size,
+                                   dtype=dtype,
+                                   device="cuda")
+    # The reference mutates the cache it is given, so it gets its own copy.
+    kv_caches_copy = kv_caches.clone()
+    # Head h uses decay exponent 0.1 * (h + 1).
+    slope_rate = torch.zeros(num_heads, device="cuda")
+    for h in range(num_heads):
+        slope_rate[h] = 0.1 * (h + 1)
+    # No padding: batch entry i maps to slot i.
+    slot_idx = torch.arange(batch_size, device="cuda")
+    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
+                                                 slope_rate, slot_idx)
+    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
+                                               slope_rate, slot_idx)
+    # NOTE(review): the tolerances (1e-1) are large compared with the
+    # 0.01-scaled inputs -- confirm they are tight enough to catch errors.
+    torch.testing.assert_close(triton_output,
+                               reference_output,
+                               rtol=1e-1,
+                               atol=1e-1)
+    # The kernel's cache is expected to match the reference's updated copy.
+    torch.testing.assert_close(kv_caches, kv_caches_copy, rtol=1e-1, atol=1e-1)
+    assert triton_output.shape == (batch_size, num_heads * head_size)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_linear_decode_forward_triton_with_padding(
+    num_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+):
+    """Compare the Triton decode kernel with the reference when padded.
+
+    A fixed batch of 4 is used in which one entry has ``slot_idx == -1``
+    (padding). Only rows and caches of non-padding entries are compared.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+    current_platform.seed_everything(42)
+    batch_size = 4
+    # Inputs are scaled down by ``base`` to keep magnitudes small.
+    base = 0.01
+    q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
+    k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
+    v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
+    kv_caches = base * torch.randn(batch_size,
+                                   num_heads,
+                                   head_size,
+                                   head_size,
+                                   dtype=dtype,
+                                   device="cuda")
+    # The reference mutates the cache it is given, so it gets its own copy.
+    kv_caches_copy = kv_caches.clone()
+    # Head h uses decay exponent 0.1 * (h + 1).
+    slope_rate = torch.zeros(num_heads, device="cuda")
+    for h in range(num_heads):
+        slope_rate[h] = 0.1 * (h + 1)
+    # Batch entry 2 is padding (-1); the others map to slots 0, 1 and 2.
+    slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")
+    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
+                                                 slope_rate, slot_idx)
+    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
+                                               slope_rate, slot_idx)
+    # Boolean mask with one row per batch entry, True for non-padding
+    # entries; indexing with it keeps only their output elements.
+    padding_mask = (slot_idx
+                    != -1).unsqueeze(1).expand(-1, num_heads * head_size)
+    triton_masked = triton_output[padding_mask]
+    reference_masked = reference_output[padding_mask]
+    # NOTE(review): the tolerances (1.5e-1) are large compared with the
+    # 0.01-scaled inputs -- confirm they are tight enough to catch errors.
+    atol, rtol = 1.5e-1, 1.5e-1
+    valid_indices = slot_idx != -1
+    # Caches are compared by batch position i, for non-padding entries only.
+    for i in range(batch_size):
+        if valid_indices[i] > 0:
+            torch.testing.assert_close(kv_caches[i],
+                                       kv_caches_copy[i],
+                                       rtol=rtol,
+                                       atol=atol)
+    torch.testing.assert_close(triton_masked,
+                               reference_masked,
+                               rtol=rtol,
+                               atol=atol)
+    assert triton_output.shape == (batch_size, num_heads * head_size)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENGTHS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_lightning_attention_reference(
+    batch_size: int,
+    num_heads: int,
+    head_size: int,
+    seq_len: int,
+    dtype: torch.dtype,
+):
+    """Compare ``lightning_attention`` with the sequential reference.
+
+    Both are called with the same q/k/v, decay exponents and block size
+    (256), each with its own copy of the initial KV history. Outputs and
+    returned KV caches are compared.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+    current_platform.seed_everything(42)
+    # Inputs are scaled down by ``base`` to keep magnitudes small.
+    base = 0.01
+    q = base * torch.randn(
+        batch_size, num_heads, seq_len, head_size, dtype=dtype)
+    k = base * torch.randn(
+        batch_size, num_heads, seq_len, head_size, dtype=dtype)
+    v = base * torch.randn(
+        batch_size, num_heads, seq_len, head_size, dtype=dtype)
+    # Head h uses decay exponent 0.1 * (h + 1).
+    ed = torch.zeros(num_heads, device="cuda")
+    for h in range(num_heads):
+        ed[h] = 0.1 * (h + 1)
+    kv_history = base * torch.randn(batch_size,
+                                    num_heads,
+                                    head_size,
+                                    head_size,
+                                    dtype=dtype,
+                                    device="cuda")
+    # Separate copy for the implementation under test.
+    kv_history_clone = kv_history.clone()
+    ref_output, ref_kv_cache = reference_lightning_attention(
+        q, k, v, ed, 256, kv_history)
+    # Imported here rather than at module level.
+    from vllm.model_executor.layers.lightning_attn import lightning_attention
+    actual_output, actual_kv_cache = lightning_attention(
+        q, k, v, ed, 256, kv_history_clone)
+    # NOTE(review): the tolerances (1.5e-1) are large compared with the
+    # 0.01-scaled inputs -- confirm they are tight enough to catch errors.
+    atol, rtol = 1.5e-1, 1.5e-1
+    torch.testing.assert_close(ref_output, actual_output, rtol=rtol, atol=atol)
+    torch.testing.assert_close(ref_kv_cache,
+                               actual_kv_cache,
+                               rtol=rtol,
+                               atol=atol)
+    assert ref_output.shape == (batch_size, num_heads, seq_len, head_size)
+    assert ref_kv_cache.shape == actual_kv_cache.shape
--- a/tests/kernels/test_mla_decode_cpu.py
+++ b/tests/kernels/test_mla_decode_cpu.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+def cdiv(a, b):
+    """Return ``a`` divided by ``b`` rounded up (for integers, ``b > 0``)."""
+    return (a + b - 1) // b
+def ref_mla(
+        out: Tensor,  # (bs, num_heads, v_head_dim)
+        query: Tensor,  # (bs, num_heads, head_dim)
+        kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+        scale: float,
+        block_tables: Tensor,  # (bs, max_num_blocks)
+        seq_lens: Tensor,  # (bs,)
+):
+    """Reference MLA decode built on ``F.scaled_dot_product_attention``.
+
+    For each batch entry the cache blocks listed in ``block_tables`` are
+    gathered and flattened, truncated to that entry's sequence length and
+    used as keys; the first ``v_head_dim`` features of the same entries
+    are used as values.
+
+    Args:
+        out: output buffer, written in place, one row block per entry.
+        query: one query per head and batch entry.
+        kv_cache: paged cache.
+        scale: softmax scale forwarded to the attention call.
+        block_tables: block ids per batch entry.
+        seq_lens: number of valid cache positions per batch entry.
+
+    Returns:
+        ``out`` (the same tensor that was passed in).
+    """
+    bs, num_heads, v_head_dim = out.shape
+    head_dim = query.shape[2]
+    for i in range(bs):
+        # gather and flatten KV-cache
+        kv = kv_cache[
+            block_tables[i]]  # (max_num_blocks, block_size, head_dim)
+        kv = kv.view(1, -1,
+                     head_dim)[:, :seq_lens[i]]  # (1, seq_len, head_dim)
+        # Values are the leading v_head_dim features of each cache entry.
+        v = kv[:, :, :v_head_dim]
+        q = query[i].view(num_heads, 1, head_dim)
+        # q carries num_heads in its first dim while kv / v carry 1;
+        # presumably enable_gqa=True is what lets the single KV head serve
+        # every query head -- confirm against the PyTorch documentation.
+        o = F.scaled_dot_product_attention(q,
+                                           kv,
+                                           v,
+                                           scale=scale,
+                                           enable_gqa=True)
+        out[i] = o.view(num_heads, v_head_dim)
+    return out
+@pytest.mark.parametrize("bs", [4])
+@pytest.mark.parametrize("mean_seq_len", [256])
+@pytest.mark.parametrize("h_q", [16])
+@pytest.mark.parametrize("d", [576])
+@pytest.mark.parametrize("dv", [512])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+def test_mla_decode_cpu(
+    bs: int,
+    mean_seq_len: int,
+    h_q: int,
+    d: int,
+    dv: int,
+    block_size: int,
+    dtype: torch.dtype,
+    varlen: bool,
+):
+    """Check ``ops.mla_decode_kvcache_cpu`` against ``ref_mla``.
+
+    A paged KV cache is filled with random data, and every position past a
+    sequence's length is overwritten with NaN so that an out-of-bounds read
+    by the kernel shows up as NaN in its output. The kernel output is then
+    compared with the reference.
+    """
+    # torch.set_default_dtype is process-global: remember the previous
+    # value and restore it afterwards so the parametrized dtype does not
+    # leak into tests that run later in the same process.
+    prev_default_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    try:
+        torch.manual_seed(0)
+        scale = d**(-0.5)
+        if varlen:
+            # Lengths drawn around mean_seq_len, clipped to at least 2.
+            seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
+            seq_lens = seq_lens.clip(2).to(torch.int32)
+        else:
+            seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
+        max_seq_len = seq_lens.max().item()
+        # Round the per-sequence capacity up to a multiple of 256.
+        seqlen_pad = cdiv(max_seq_len, 256) * 256  # is this necessary?
+        q = torch.randn(bs, h_q, d)
+        # Sequence i owns a contiguous run of seqlen_pad // block_size blocks.
+        block_table = torch.arange(bs * seqlen_pad // block_size,
+                                   dtype=torch.int32)
+        block_table = block_table.view(bs, seqlen_pad // block_size)
+        kv_cache = torch.randn(block_table.numel(), block_size, d)
+        # Poison everything past each sequence's length with NaN.
+        for i, seq_len in enumerate(seq_lens.tolist()):
+            kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")
+        out_mla = q.new_zeros(bs, h_q, dv)
+        ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
+                                   seq_lens)
+        out_ref = q.new_zeros(bs, h_q, dv)
+        ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
+        assert not out_mla.isnan().any(), "Likely read out of bounds"
+        torch.testing.assert_close(out_mla, out_ref)
+    finally:
+        torch.set_default_dtype(prev_default_dtype)
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -3,7 +3,6 @@
 Run `pytest tests/kernels/test_moe.py`.
 """
 import pytest
 import torch
 from torch.nn import Parameter
@@ -216,11 +215,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
 @pytest.mark.parametrize("dtype",
                         [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
 @torch.inference_mode()
-def test_mixtral_moe(dtype: torch.dtype, padding: bool):
+def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
+                     monkeypatch):
    """Make sure our Mixtral MoE implementation agrees with the one from
    huggingface."""
+    if use_rocm_aiter:
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    # Instantiate our and huggingface's MoE blocks
    config = MixtralConfig()
    hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
@@ -268,10 +273,18 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool):
        torch.bfloat16: 1e-2,
    }
-    torch.testing.assert_close(hf_states.flatten(0, 1),
+    if use_rocm_aiter:
-                               vllm_states,
+        # The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
-                               rtol=mixtral_moe_tol[dtype],
+        # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174  # noqa: E501
-                               atol=mixtral_moe_tol[dtype])
+        torch.testing.assert_close(hf_states.flatten(0, 1),
+                                   vllm_states,
+                                   rtol=0.01,
+                                   atol=100)
+    else:
+        torch.testing.assert_close(hf_states.flatten(0, 1),
+                                   vllm_states,
+                                   rtol=mixtral_moe_tol[dtype],
+                                   atol=mixtral_moe_tol[dtype])
 @pytest.mark.skipif(current_platform.is_rocm(),

--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -167,6 +167,7 @@ def test_contexted_kv_attention(
       block_table,
       b_start_loc,
       b_seq_len,
+       MAX_CTX_LEN,
       max_input_len,
       k_scale,
       v_scale,
@@ -183,6 +184,7 @@ def test_contexted_kv_attention(
       block_table,
       b_start_loc,
       b_seq_len,
+       MAX_CTX_LEN,
       max_input_len,
       k_scale,
       v_scale,
@@ -401,6 +403,7 @@ def test_contexted_kv_attention_alibi(
       block_table,
       b_start_loc,
       b_seq_len,
+       MAX_CTX_LEN,
       max_input_len,
       k_scale,
       v_scale,
@@ -417,6 +420,7 @@ def test_contexted_kv_attention_alibi(
       block_table,
       b_start_loc,
       b_seq_len,
+       MAX_CTX_LEN,
       max_input_len,
       k_scale,
       v_scale,

--- a/tests/kernels/test_uva.py
+++ b/tests/kernels/test_uva.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
+# A single device string when exactly one GPU is reported; otherwise two,
+# "cuda:0" and "cuda:1" (this includes the case where no GPU is reported).
+CUDA_DEVICES = (["cuda:0"] if torch.cuda.device_count() == 1 else
+                ["cuda:0", "cuda:1"])
+@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_cpu_write(device):
+    """Check that values written into the pinned CPU tensor are seen through
+    the CUDA view obtained from it."""
+    torch.set_default_device(device)
+    host_buf = torch.zeros(10,
+                           10,
+                           device="cpu",
+                           pin_memory=True,
+                           dtype=torch.int32)
+    cuda_view = get_cuda_view_from_cpu_tensor(host_buf)
+    assert cuda_view.device.type == "cuda"
+    # (row, col) -> value stored from the CPU side.
+    host_writes = {(0, 0): 1, (2, 3): 2, (4, 5): -1}
+    # The probed cells start out as zero.
+    for pos in host_writes:
+        assert cuda_view[pos] == 0
+    for pos, value in host_writes.items():
+        host_buf[pos] = value
+    # Double in place on the CUDA side, then read back through the view.
+    cuda_view.mul_(2)
+    for pos, value in host_writes.items():
+        assert cuda_view[pos] == 2 * value
+@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_gpu_write(device):
+    """Check that values written through the CUDA view are seen in the
+    pinned CPU tensor the view was obtained from."""
+    torch.set_default_device(device)
+    cpu_tensor = torch.zeros(10,
+                             10,
+                             device="cpu",
+                             pin_memory=True,
+                             dtype=torch.int32)
+    cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
+    assert cuda_view.device.type == "cuda"
+    assert cuda_view[0, 0] == 0
+    assert cuda_view[2, 3] == 0
+    assert cuda_view[4, 5] == 0
+    cuda_view[0, 0] = 1
+    cuda_view[2, 3] = 2
+    cuda_view[4, 5] = -1
+    cuda_view.mul_(2)
+    # CUDA work is queued asynchronously; wait for the writes above to finish
+    # before reading the same memory from the host, or the asserts may race.
+    torch.cuda.synchronize(cuda_view.device)
+    assert cpu_tensor[0, 0] == 2
+    assert cpu_tensor[2, 3] == 4
+    assert cpu_tensor[4, 5] == -2
\ No newline at end of file
--- a/tests/kernels/untest_ggml.py
+++ b/tests/kernels/untest_ggml.py
@@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type):
    qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
    m = qweight.shape[0]
    n = qweight.shape[1] // type_size * block_size
-    opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n))
+    opcheck(torch.ops._C.ggml_dequantize,
+            (qweight, quant_type, m, n, torch.float16))
    x = torch.rand((m, 512), device='cuda', dtype=torch.float16)
    opcheck(torch.ops._C.ggml_mul_mat_a8,

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -2,7 +2,6 @@
 import tempfile
 from collections import OrderedDict
-from typing import TypedDict
 from unittest.mock import MagicMock, patch
 import pytest
@@ -28,28 +27,6 @@ from vllm.platforms import current_platform
 from ..utils import models_path_prefix
-class ContextIDInfo(TypedDict):
-    lora_id: int
-    context_length: str
-class ContextInfo(TypedDict):
-    lora: str
-    context_length: str
-LONG_LORA_INFOS: list[ContextIDInfo] = [{
-    "lora_id": 1,
-    "context_length": "16k",
-}, {
-    "lora_id": 2,
-    "context_length": "16k",
-}, {
-    "lora_id": 3,
-    "context_length": "32k",
-}]
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
    """Allow subdirectories to skip global cleanup by overriding this fixture.
@@ -256,41 +233,6 @@ def long_context_lora_files_16k_1():
    return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1")
-@pytest.fixture(scope="session")
-def long_context_lora_files_16k_2():
-    # return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2")
-    return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_2")
-@pytest.fixture(scope="session")
-def long_context_lora_files_32k():
-    # return snapshot_download(repo_id="SangBinCho/long_context_32k_testing")
-    return os.path.join(models_path_prefix, "SangBinCho/long_context_32k_testing")
-@pytest.fixture(scope="session")
-def long_context_infos(long_context_lora_files_16k_1,
-                       long_context_lora_files_16k_2,
-                       long_context_lora_files_32k):
-    cleanup_dist_env_and_memory(shutdown_ray=True)
-    infos: dict[int, ContextInfo] = {}
-    for lora_checkpoint_info in LONG_LORA_INFOS:
-        lora_id = lora_checkpoint_info["lora_id"]
-        if lora_id == 1:
-            lora = long_context_lora_files_16k_1
-        elif lora_id == 2:
-            lora = long_context_lora_files_16k_2
-        elif lora_id == 3:
-            lora = long_context_lora_files_32k
-        else:
-            raise AssertionError("Unknown lora id")
-        infos[lora_id] = {
-            "context_length": lora_checkpoint_info["context_length"],
-            "lora": lora,
-        }
-    return infos
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings():
    cleanup_dist_env_and_memory(shutdown_ray=True)

--- a/tests/lora/data/long_context_test_data.py
+++ b/tests/lora/data/long_context_test_data.py
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -43,14 +43,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 def test_baichuan_lora(baichuan_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,

--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -20,6 +20,14 @@ EXPECTED_LORA_OUTPUT = [
 ]
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    """Autouse shim that requests ``run_with_both_engines_lora`` so each test
+    in this module runs with both engines.
+
+    This could be promoted to conftest.py to cover every test in a package.
+    """
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 @create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
    llm = vllm.LLM(MODEL_PATH,

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
 # SPDX-License-Identifier: Apache-2.0
-import importlib
 import random
 from copy import deepcopy
 from dataclasses import dataclass
@@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              LinearScalingRotaryEmbeddingWithLoRA,
                              LogitsProcessorWithLoRA, LoRAMapping,
                              MergedColumnParallelLinearWithLoRA,
                              MergedQKVParallelLinearWithLoRA,
@@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                              RowParallelLinearWithLoRA,
                              VocabParallelEmbeddingWithLoRA)
 # yapf: enable
-from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
+from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
-                              PackedLoRALayerWeights)
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
@@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               ReplicatedLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
 from vllm.model_executor.utils import set_random_seed
@@ -60,32 +56,16 @@ DEVICES = ([
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ] if current_platform.is_cuda_alike() else ["cpu"])
-#For GPU, we will launch different triton kernels between the prefill and decode
+# prefill stage(True) or decode stage(False)
-# stages, so we need to verify this. prefill stage(True) or decode stage(False)
 STAGES = [True, False]
-# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
+NUM_RANDOM_SEEDS = 6
-# the tests in this file run twice, once with the V0 engine and then with
-# the V1 engine.
-# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
-# with the inclusion of V1 tests to maintain the CI test times.
-NUM_RANDOM_SEEDS = 5
-# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
-# 256 before. It is cut to half with the inclusion of V1 tests to maintain
-# the CI test times.
 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 @pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
+def clean_cache():
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    # Reload punica_gpu as the kernels used are tied to engine type.
-    from vllm.lora.punica_wrapper import punica_gpu
-    importlib.reload(punica_gpu)
    # Release any memory we might be holding on to. CI runs OOMs otherwise.
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)
@@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora):
    yield
+@pytest.fixture(autouse=True)
+def skip_cuda_with_stage_false(request):
+    """
+    Skip the ``stage=False`` parametrization on cuda-like platforms.
+
+    On cuda-like platforms, we use the same kernels for prefill and decode
+    stage, and 'stage' is generally ignored, so we only need to test once.
+    Tests without a ``stage`` parameter, and all tests on other platforms,
+    run unchanged.
+    """
+    if current_platform.is_cuda_alike():
+        # Non-parametrized tests have no ``callspec``; fall back to an empty
+        # parameter set so no exception handling is needed here.
+        callspec = getattr(request.node, "callspec", None)
+        params = getattr(callspec, "params", None) or {}
+        if params.get("stage") is False:
+            pytest.skip("Skip test when stage=False")
+    yield
 def get_random_id_to_index(num_loras: int,
                           num_slots: int,
                           log: bool = True) -> list[Optional[int]]:
@@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                   atol=atol)
-@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 8])
-@pytest.mark.parametrize("device", ["cuda"])
-@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
-                                             (6.0, 1.0)])
-@pytest.mark.parametrize("max_position", [11, 4096, 32768])
-@pytest.mark.parametrize("is_neox_style", [True, False])
-@pytest.mark.parametrize("rotary_dim", [None, 32])
-@pytest.mark.parametrize("head_size", [32, 108])
-@pytest.mark.parametrize("seq_len", [11, 1024])
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only CUDA backends are supported")
-def test_rotary_embedding_long_context(dist_init, num_loras, device,
-                                       scaling_factors, max_position,
-                                       is_neox_style, rotary_dim, head_size,
-                                       seq_len) -> None:
-    dtype = torch.float16
-    max_loras = 8
-    seed = 0
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             long_lora_scaling_factors=scaling_factors,
-                             lora_dtype=dtype)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    base = 10000
-    batch_size = 5 * num_loras
-    num_heads = 7
-    # Verify lora is equivalent to linear scaling rotary embedding.
-    rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-    )
-    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
-    lora_rope.set_mapping(punica_wrapper)
-    lora_rope.create_lora_weights(max_loras, lora_config)
-    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
-                           is_neox_style, {
-                               "rope_type": "linear",
-                               "factor": scaling_factors
-                           })
-    linear_rope = linear_rope.to(dtype=dtype)
-    id_to_index = get_random_id_to_index(num_loras, max_loras)
-    _, index_mapping, prompt_mapping = create_random_inputs(
-        active_lora_ids=[0],
-        num_inputs=batch_size,
-        input_size=(1, max_position),
-        input_range=(0, lora_config.lora_extra_vocab_size),
-        input_type=torch.float16,
-        device=device)
-    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
-    long_lora_context = LongContextLoRAContext(list(scaling_factors),
-                                               rotary_dim)
-    next_expected_offset = 0
-    # Make sure the offset is correct.
-    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
-    for scaling_factor, offset in scaling_factor_to_offset.items():
-        assert offset == next_expected_offset
-        next_expected_offset += scaling_factor * max_position
-    for i in range(len(scaling_factors)):
-        long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
-            scaling_factors[i], 0)
-    punica_wrapper.update_metadata(
-        lora_mapping,
-        id_to_index,
-        max_loras,
-        512,
-        lora_config.lora_extra_vocab_size,
-        long_lora_context=long_lora_context,
-    )
-    # lora_rope.set_mapping(*mapping_info)
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query)
-    ref_q, ref_k = linear_rope(positions, query, key)
-    actual_q, actual_k = lora_rope(positions, query, key)
-    torch.allclose(ref_q, actual_q)
-    torch.allclose(ref_k, actual_k)
 @pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
 @pytest.mark.parametrize(
    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -29,6 +29,14 @@ EXPECTED_LORA_OUTPUT = [
 ]
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    """Autouse shim that requests ``run_with_both_engines_lora`` so each test
+    in this module runs with both engines.
+
+    This could be promoted to conftest.py to cover every test in a package.
+    """
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
@@ -72,16 +80,6 @@ def generate_and_test(llm, sql_lora_files):
    print("removing lora")
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @create_new_process_for_each_test()
 def test_llama_lora(sql_lora_files):
@@ -127,8 +125,6 @@ def test_llama_lora_warmup(sql_lora_files):
        "less when using lora than when not using lora")
-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
@@ -158,20 +154,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
        enable_chunked_prefill=True,
    )
    generate_and_test(llm, sql_lora_files)
-@multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
-def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        tensor_parallel_size=4,
-        fully_sharded_loras=True,
-        enable_lora_bias=True,
-        enable_chunked_prefill=True,
-    )
-    generate_and_test(llm, sql_lora_files)
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -7,7 +7,6 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
-from vllm import envs
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                              MergedColumnParallelLinearWithLoRA,
@@ -33,6 +32,17 @@ DEVICES = ([
 ] if current_platform.is_cuda_alike() else ["cpu"])
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Set ``VLLM_USE_V1=0`` for the duration of each test in this module.
+
+    Some tests depend on V0 internals. Since both V0 and V1 use the same
+    LoRAModelManager, testing V0 alone is sufficient.
+    """
+    # The context manager reverts the environment change once the test that
+    # is running at the ``yield`` has finished.
+    with monkeypatch.context() as patched_env:
+        patched_env.setenv('VLLM_USE_V1', '0')
+        yield
 @pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
    tensors = load_file(
@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    assert manager.device == device
-@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                          sql_lora_files, device):
@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
            device)
-@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                sql_lora_files, device):

--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -60,7 +60,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@create_new_process_for_each_test()
 def test_minicpmv_lora(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
@@ -80,6 +79,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
        assert EXPECTED_OUTPUT[i].startswith(output2[i])
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
@@ -101,6 +102,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")

--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -12,6 +12,14 @@ MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2")
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    """Autouse shim that requests ``run_with_both_engines_lora`` so each test
+    in this module runs with both engines.
+
+    This could be promoted to conftest.py to cover every test in a package.
+    """
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(
@@ -50,14 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
 @pytest.mark.skip_v1

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -40,6 +40,14 @@ else:
    ]
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    """Autouse shim that requests ``run_with_both_engines_lora`` so each test
+    in this module runs with both engines.
+
+    This could be promoted to conftest.py to cover every test in a package.
+    """
 def do_sample(llm: vllm.LLM,
              lora_path: str,
              lora_id: int,
@@ -72,14 +80,6 @@ def do_sample(llm: vllm.LLM,
    return generated_texts
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,