vllm kvprune wo:v1.1.0

d29c39ca · chenzk · f81ce56b · d29c39ca · d29c39ca · d29c39ca
Commit d29c39ca authored Apr 30, 2026 by chenzk
20 changed files
--- a/vllm/compactor-vllm/src/compactor_vllm/attention/compile_kernels.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/attention/compile_kernels.py
+import argparse
+import logging
+import math
+
+import torch
+from compactor_vllm.attention.sparse_varlen_kernel import (
+    causal_sparse_varlen_with_cache,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def build_mock_paged_cache_from_lengths(
+    L_cache_per_b: torch.Tensor,
+    HKV: int,
+    D: int,
+    PAGE_SIZE: int,
+    N_LOGICAL_PAGES_MAX: int,
+    device,
+    dtype,
+):
+    B = len(L_cache_per_b)
+    max_len = PAGE_SIZE * N_LOGICAL_PAGES_MAX
+    assert (L_cache_per_b <= max_len).all()
+
+    seq_lens_bh = torch.empty((B, HKV), dtype=torch.int32, device=device)
+    for b in range(B):
+        seq_lens_bh[b, :].fill_(L_cache_per_b[b])
+
+    num_phys_pages = B * HKV * N_LOGICAL_PAGES_MAX
+    CACHE_SIZE = num_phys_pages * PAGE_SIZE
+
+    K_cache = torch.zeros((CACHE_SIZE, D), device=device, dtype=dtype)
+    V_cache = torch.zeros((CACHE_SIZE, D), device=device, dtype=dtype)
+    page_table = torch.empty(
+        (B, HKV, N_LOGICAL_PAGES_MAX), device=device, dtype=torch.int32
+    )
+
+    # assign unique physical pages per (b, h, lp)
+    phys_page = 0
+    for b in range(B):
+        for h in range(HKV):
+            for lp in range(N_LOGICAL_PAGES_MAX):
+                page_table[b, h, lp] = phys_page
+                phys_page += 1
+
+    for b in range(B):
+        Lc = int(L_cache_per_b[b].item())
+        for h in range(HKV):
+            for i in range(Lc):
+                lp = i // PAGE_SIZE
+                off = i % PAGE_SIZE
+                phys = int(page_table[b, h, lp].item())
+                idx = phys * PAGE_SIZE + off
+                K_cache[idx] = torch.randn(D, device=device, dtype=dtype)
+                V_cache[idx] = torch.randn(D, device=device, dtype=dtype)
+
+    return K_cache, V_cache, page_table, seq_lens_bh, CACHE_SIZE
+
+
+def autotune_causal_sparse_varlen_with_cache(
+    *,
+    max_length: int = 16384,
+    HKV: int = 8,
+    HQ: int = 32,
+    D: int = 128,
+    PAGE_SIZE: int = 128,
+    device: str = "cuda",
+    dtype=torch.float16,
+):
+    """
+    Autotune causal_sparse_varlen_with_cache over a sweep of cache/append lengths.
+    """
+    import itertools
+
+    import tqdm
+
+    N_LOGICAL_PAGES_MAX = ((max_length + PAGE_SIZE - 1) // PAGE_SIZE) * PAGE_SIZE
+    B = 4
+
+    # D must be a power of two (kernel requirement).
+    assert (D & (D - 1)) == 0
+
+    lengths_to_sweep = [0, 256]
+    i = 9
+    while (v := (1 << i)) < max_length:
+        lengths_to_sweep.append(v)
+        i += 1
+
+    combos = list(itertools.product(lengths_to_sweep, repeat=2))
+    logger.info(
+        "tuning kernels. this may take a few minutes, "
+        "but only needs to be run once per LLMConfig"
+    )
+
+    for cache_l, append_l in tqdm.tqdm(combos):
+        if cache_l + append_l == 0:
+            continue
+
+        L_cache_per_b = torch.tensor(
+            [cache_l] * B,
+            device=device,
+            dtype=torch.int32,
+        )
+        assert (L_cache_per_b <= PAGE_SIZE * N_LOGICAL_PAGES_MAX).all()
+        K_cache, V_cache, page_table, seq_lens_bh, CACHE_SIZE = (
+            build_mock_paged_cache_from_lengths(
+                L_cache_per_b=L_cache_per_b,
+                HKV=HKV,
+                D=D,
+                PAGE_SIZE=PAGE_SIZE,
+                N_LOGICAL_PAGES_MAX=N_LOGICAL_PAGES_MAX,
+                device=device,
+                dtype=dtype,
+            )
+        )
+
+        L_app_list = [append_l] * B
+        cu = [0]
+        for L in L_app_list:
+            cu.append(cu[-1] + L)
+        cu_seqlens_qk = torch.tensor(cu, dtype=torch.int32, device=device)
+        N = int(cu_seqlens_qk[-1].item())
+
+        max_seqlen_q = int((cu_seqlens_qk[1:] - cu_seqlens_qk[:-1]).max().item())
+        max_seqlen_k = seq_lens_bh.max().item()
+        q_raw = torch.randn(N, HQ, D, device=device, dtype=dtype)
+        k_append_raw = torch.randn(N, HKV, D, device=device, dtype=dtype)
+        v_append_raw = torch.randn(N, HKV, D, device=device, dtype=dtype)
+
+        # Identity batch mapping (local batch index == global)
+        batch_mapping = torch.arange(B, device=device, dtype=torch.int32)
+
+        sm_scale = 1.0 / math.sqrt(D)
+
+        causal_sparse_varlen_with_cache(
+            q=q_raw,
+            k_cache=K_cache,
+            v_cache=V_cache,
+            k=k_append_raw,
+            v=v_append_raw,
+            seq_lens_bh=seq_lens_bh,
+            global_page_table=page_table,
+            batch_mapping=batch_mapping,
+            cu_seqlens_q=cu_seqlens_qk,
+            HKV=HKV,
+            PAGE_SIZE=PAGE_SIZE,
+            sm_scale=sm_scale,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k_cache=max_seqlen_k,
+        )
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Autotune Triton kernels. "
+                    "Results are cached, so this should only need to be run once per configuration."
+                    "This script doesn't need to be run, as the kernels will be autotuned at runtime"
+                    "if no cached autotuning data exists. Running this before hand will prevent run-time"
+                    "autotuning, which will accelerate compactor-vllm at inference time."
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=16384,
+        help="Maximum total sequence length to consider.",
+    )
+    parser.add_argument(
+        "--HKV",
+        type=int,
+        default=8,
+        help="Number of KV heads.",
+    )
+    parser.add_argument(
+        "--HQ",
+        type=int,
+        default=32,
+        help="Number of query heads.",
+    )
+    parser.add_argument(
+        "--D",
+        type=int,
+        default=128,
+        help="Per-head hidden dimension (must be power of 2).",
+    )
+    parser.add_argument(
+        "--page-size",
+        type=int,
+        default=128,
+        help="Page size (tokens per physical page).",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Torch device to run on (e.g. 'cuda', 'cuda:0', 'cpu').",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float16",
+        help="Dtype for tensors: one of {float16, fp16, bfloat16, bf16, float32, fp32}.",
+    )
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        default="INFO",
+        choices=["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"],
+        help="Logging level.",
+    )
+    return parser.parse_args()
+
+
+def _resolve_dtype(dtype_str: str):
+    s = dtype_str.lower()
+    if s in ("float16", "fp16", "half"):
+        return torch.float16
+    if s in ("bfloat16", "bf16"):
+        return torch.bfloat16
+    if s in ("float32", "fp32"):
+        return torch.float32
+    raise ValueError(f"Unsupported dtype: {dtype_str}")
+
+
+def main():
+    args = _parse_args()
+    logging.basicConfig(
+        level=getattr(logging, args.log_level.upper()),
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+
+    dtype = _resolve_dtype(args.dtype)
+    logger.info(
+        "Starting autotune with max_length=%d, HKV=%d, HQ=%d, D=%d, page_size=%d, "
+        "device=%s, dtype=%s",
+        args.max_length,
+        args.HKV,
+        args.HQ,
+        args.D,
+        args.page_size,
+        args.device,
+        dtype,
+    )
+
+    autotune_causal_sparse_varlen_with_cache(
+        max_length=args.max_length,
+        HKV=args.HKV,
+        HQ=args.HQ,
+        D=args.D,
+        PAGE_SIZE=args.page_size,
+        device=args.device,
+        dtype=dtype,
+    )
+
+
+if __name__ == "__main__":
+    # Script entry point: install an INFO-level root handler, then run main().
+    # NOTE(review): main() calls logging.basicConfig() as well; per the logging
+    # documentation a later call does nothing once the root logger has
+    # handlers unless force=True is passed -- confirm which configuration is
+    # meant to win.
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s: %(message)s",
+    )
+    main()
--- a/vllm/compactor-vllm/src/compactor_vllm/attention/sparse_decode_kernel.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/attention/sparse_decode_kernel.py
+import functools
+import math
+
+import torch
+import triton
+import triton.language as tl
+
+from compactor_vllm.utils.triton_compat import (
+    autotune as triton_autotune,
+    maybe_set_allocator,
+)
+
+
+def head_sparse_decode_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_lens_bh: torch.Tensor,
+    global_page_table: torch.Tensor,
+    batch_mapping: torch.Tensor,
+    HKV: int,
+    PAGE_SIZE: int,
+    sm_scale: float = None,
+    key_split: int = None,
+):
+    """
+    Decode-time head-sparse attention over a paged KV cache.
+
+    This is a wrapper around the Triton decode kernel used during incremental
+    generation. For each batch, we read the cached keys
+    and values from a global paged KV buffer, apply causal attention with one
+    new query token, and return the attention output.
+
+    The KV cache is stored in a single global K/V tensor of shape
+    ``[CACHE_SIZE, D]`` and indexed via a per-layer page table. Each logical
+    (batch, kv_head, token_idx) is mapped to a physical row in the cache by:
+
+        1. Looking up the logical page index in ``global_page_table[b, h, lp]``,
+        2. Computing ``phys_row = page_id * PAGE_SIZE + (token_idx % PAGE_SIZE)``.
+
+    Grouped-query attention (GQA / MQA) is supported by passing more query
+    heads than KV heads (``HQ`` must be a multiple of ``HKV``).
+
+    Args:
+        :param q: Query tensor of shape ``[B, HQ, D]`` or `[B, 1, HQ, D]``
+            containing the new decode tokens for each sequence in the launch batch.
+        :param k: Global key cache of shape ``[CACHE_SIZE, D]``. This is the shared
+            backing buffer for all (batch, head) KV pages.
+        :param v: Global value cache of shape ``[CACHE_SIZE, D]``.
+        :param seq_lens_bh: Tensor of shape ``[B, HKV]`` (int32) giving, for each
+            local batch index and KV head, the number of valid cached tokens
+            in the paged KV cache.
+        :param global_page_table: Tensor of shape
+            ``[MAX_NUM_BATCHES, HKV, N_LOGICAL_PAGES_MAX]`` (int32) mapping
+            ``(true_batch_idx, kv_head, logical_page)`` to a physical page id
+            in the global cache.
+        :param batch_mapping: Tensor of shape ``[B]`` (int32) mapping the launch-batch
+            index used by this call to the true batch row used to index
+            ``global_page_table``.
+        :param HKV: Number of KV heads.
+        :param PAGE_SIZE: Number of tokens stored per physical KV page.
+        :param sm_scale: Optional scaling factor applied to the attention logits
+            before softmax. If ``None``, ``1 / sqrt(D)`` is used.
+        :param key_split: Optional number of splits along the key sequence length.
+            If > 1, the kernel will process the KV sequence in ``key_split``
+            chunks to reduce on-chip memory usage. If ``None`` or 0, a
+            heuristic is used.
+
+    Returns:
+        :return torch.Tensor: Attention output of shape ``[B, HQ, D]`` on the same
+        device and dtype as ``q``.
+    """
+
+    with torch.cuda.device(q.device):
+        if q.ndim != 3:
+            assert q.ndim == 4
+            B, HQ, S, D = q.shape
+            assert S == 1, "head_sparse_decode_attention only supports q_len=1"
+            q = q.squeeze(-2)
+        elif q.ndim == 3:
+            B, HQ, D = q.shape
+
+        CACHE_SIZE = k.shape[0]
+        assert PAGE_SIZE % 32 == 0, "PAGE_SIZE must be divisible by 128"
+        GROUP_M = HQ // HKV
+        assert GROUP_M * HKV == HQ, "HQ must be divisible by H_kv"
+
+        FP8 = hasattr(torch, "float8_e5m2") and q.dtype == torch.float8_e5m2
+
+        seq_lens_bh = seq_lens_bh.to(torch.int32)
+        assert B <= 32767, "too many batches"
+        assert global_page_table.shape[1] == HKV
+        assert q.is_contiguous()
+        assert (D & (D - 1)) == 0, "D must be a power of 2"
+        N_LOGICAL_PAGES_MAX = global_page_table.shape[-1]
+
+        sm_scale = 1 / math.sqrt(D) if sm_scale is None else sm_scale
+        if key_split is None:
+            # round max_seq_len to the next power of two to maximize cache hits
+            key_split = num_splits_heuristic(
+                B * HKV,
+                max_seq_len=1 << int(seq_lens_bh.max()).bit_length(),
+                num_sms=torch.cuda.get_device_properties(
+                    q.device
+                ).multi_processor_count,
+                max_splits=12,
+            )
+
+        maybe_set_allocator(
+            lambda size, align, _: torch.empty(size, dtype=torch.int8, device=q.device)
+        )
+
+        # stage 1 scratch
+        mid_o = torch.empty((B, key_split, HQ, D), device=q.device, dtype=q.dtype)
+        mid_lse = torch.empty((B, key_split, HQ), device=q.device, dtype=torch.float32)
+        # processes all queries for a KV head together
+        # pointers are lowercase, CONSTANTS are upper
+        grid1 = (B, HKV, key_split)
+        _varkv_stage1_groupM[grid1](
+            q=q,
+            k=k,
+            v=v,
+            mid_o=mid_o,
+            mid_lse=mid_lse,
+            page_table_bhl=global_page_table,
+            batch_mapping=batch_mapping,
+            seq_lens_bh=seq_lens_bh.contiguous(),
+            SM_SCALE=sm_scale,
+            B=B,
+            HKV=HKV,
+            HQ=HQ,
+            CACHE_SIZE=CACHE_SIZE,
+            STRIDE_LBS=mid_lse.stride(0),
+            STRIDE_LS=mid_lse.stride(1),
+            STRIDE_LH=mid_lse.stride(2),
+            N_LOGICAL_PAGES_MAX=N_LOGICAL_PAGES_MAX,
+            D=D,
+            KEY_SPLIT=key_split,
+            GROUP_M=GROUP_M,
+            DTYPE=tl.float8e5
+            if FP8
+            else (tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16),
+            PAGE_SIZE=PAGE_SIZE,
+        )
+
+        if key_split == 1:
+            return mid_o.squeeze(1).contiguous()
+
+        # reduce partial results across splits
+        output = torch.empty_like(q)
+        grid2 = (B, HQ)
+        _varkv_stage2_reduce[grid2](
+            mid_o=mid_o,
+            mid_lse=mid_lse,
+            output=output,
+            STRIDE_LBS=mid_lse.stride(0),
+            STRIDE_LS=mid_lse.stride(1),
+            STRIDE_LH=mid_lse.stride(2),
+            STRIDE_OBS=output.stride(0),
+            STRIDE_OH=output.stride(1),
+            B=B,
+            HQ=HQ,
+            D=D,  # type: ignore
+            KEY_SPLIT=key_split,  # type: ignore
+            DTYPE=tl.float8e5
+            if FP8
+            else (tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16),
+        )
+        return output
+
+
+# similar to flash attention split heuristic
+@functools.lru_cache(maxsize=128)
+def num_splits_heuristic(
+    total_mblocks: int,
+    max_seq_len: int,
+    num_sms: int,
+    max_splits: int,
+) -> int:
+    # If we nearly fill SMs already, prefer 1 split
+    if total_mblocks >= 0.8 * num_sms or max_seq_len <= 1024:
+        return 1
+    eff = []
+    max_eff = 0.0
+    for s in range(1, min(max_splits, num_sms) + 1):
+        if (max_seq_len / s) <= 512:
+            break
+        n_waves = float(total_mblocks * s) / float(num_sms)
+        e = n_waves / math.ceil(n_waves) if n_waves > 0 else 0.0
+        eff.append(e)
+        max_eff = max(max_eff, e)
+    threshold = 0.75 * max_eff  # if not split_min_hit else 0.9 * max_eff
+    for i, e in enumerate(eff, start=1):
+        if e >= threshold:
+            return i
+    return 1
+
+
+def prune_invalid_configs(configs, _, **kwargs):
+    PAGE_SIZE = kwargs["PAGE_SIZE"]
+    return [conf for conf in configs if conf.kwargs.get("BLOCK_N", 0) <= PAGE_SIZE]
+
+
+@triton_autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_N": BLOCK_N, "MIN_BLOCK_KV": MIN_BLOCK_KV, "WARPSPEC": ws},
+            num_warps=w,
+            num_stages=s,
+        )
+        for BLOCK_N in [32, 64, 128]
+        for MIN_BLOCK_KV in [8]
+        for s in [2, 3, 4]
+        for w in [4, 8]
+        for ws in [True, False]
+    ],
+    key=[
+        "HKV",
+        "GROUP_M",
+        "D",
+        "PAGE_SIZE",  # "B"
+    ],
+    cache_results=True,
+    prune_configs_by={"early_config_prune": prune_invalid_configs},
+)
+@triton.jit
+def _varkv_stage1_groupM(
+    q,  # [B, HQ, D] contiguous
+    k,  # GLOBAL cache: [CACHE_SIZE, D], contiguous
+    v,  # GLOBAL cache: [CACHE_SIZE, D], contiguous
+    mid_o,  # per-split partial outputs, indexed below as contiguous [B, KEY_SPLIT, HQ, D]
+    mid_lse,  # per-split log-sum-exp values, addressed through the STRIDE_L* arguments
+    page_table_bhl,  # int32 [B*H_kv*N_LOGICAL_PAGES_MAX] (flattened)
+    batch_mapping,  # int32 [B]  maps local pid_b -> true batch index
+    seq_lens_bh,  # int32 [B*H_kv] valid tokens per (b,h)
+    SM_SCALE,
+    B,
+    HKV,
+    HQ,
+    CACHE_SIZE,  # CACHE_SIZE = N_PAGES * PAGE_SIZE
+    STRIDE_LBS,
+    STRIDE_LS,
+    STRIDE_LH,
+    # constexprs
+    N_LOGICAL_PAGES_MAX: tl.constexpr,  # page table width per (b,h)
+    D: tl.constexpr,
+    KEY_SPLIT: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    DTYPE: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    WARPSPEC: tl.constexpr,
+    PAGE_SIZE: tl.constexpr,
+):
+    # Stage 1 of split-KV decode attention. Each program handles one
+    # (launch-batch index, KV head, key split) triple: it attends the GROUP_M
+    # query heads that share the KV head over its slice of the cached tokens
+    # and stores a normalised partial output plus its log-sum-exp.
+    pid_b = tl.program_id(0)  # batch
+    pid_kvh = tl.program_id(1)  # kv head
+    pid_s = tl.program_id(2)  # split
+
+    # valid length L for this (b,h)
+    bh_stride = HKV
+    L = tl.load(seq_lens_bh + pid_b * bh_stride + pid_kvh)
+    # Nothing cached for this (b,h): exit before anything is stored to
+    # mid_o / mid_lse for this program.
+    if L == 0:
+        return
+
+    tl.assume(L > 0)
+
+    # split sizing on logical token axis [0..L)
+    # per_split_len is ceil(L / KEY_SPLIT) rounded up to a multiple of
+    # MIN_BLOCK_KV, so trailing splits can come out empty.
+    base = tl.cdiv(L, KEY_SPLIT)
+    per_split_len = tl.cdiv(base, MIN_BLOCK_KV) * MIN_BLOCK_KV
+    split_start = pid_s * per_split_len
+    split_end = tl.minimum(split_start + per_split_len, L)
+
+    # query heads mapped to this kv head
+    base_qh = pid_kvh * GROUP_M
+    # The query tile has at least 16 rows; rows >= GROUP_M are masked out.
+    # (presumably to meet tl.dot's minimum tile size -- TODO confirm)
+    GROUP_M_PAD: tl.constexpr = 16 if GROUP_M < 16 else GROUP_M
+    offs_m = tl.arange(0, GROUP_M_PAD)
+    mask_m = offs_m < GROUP_M
+    offs_d = tl.arange(0, D)
+
+    # load Q tile [M, D]
+    q_ptrs = q + (pid_b * HQ + base_qh + offs_m)[:, None] * D + offs_d[None, :]
+    q = tl.load(q_ptrs, mask=mask_m[:, None], other=0.0).to(DTYPE)  # [M, D]
+
+    # streaming softmax state per query
+    e_max = tl.zeros([GROUP_M_PAD], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([GROUP_M_PAD], dtype=tl.float32)
+    acc = tl.zeros([GROUP_M_PAD, D], dtype=tl.float32)
+
+    if split_end > split_start:
+        # logical pages covering [split_start, split_end)
+        lp0 = split_start // PAGE_SIZE
+        lp1 = tl.cdiv(split_end, PAGE_SIZE)  # exclusive
+
+        mapped_b = tl.load(batch_mapping + pid_b)
+        tl.assume(mapped_b >= 0)
+        # page table base for this (b,h)
+        pt_stride = N_LOGICAL_PAGES_MAX
+        pt_base = (mapped_b * HKV + pid_kvh) * pt_stride
+
+        for lp in tl.range(lp0, lp1):
+            phys = tl.load(
+                page_table_bhl + pt_base + lp, cache_modifier=".cg"
+            )  # physical page id
+            # bounds within the logical page
+            # Only the first and last page of the split can be partial.
+            local_start = tl.where(lp == lp0, split_start - lp * PAGE_SIZE, 0)
+            local_end = tl.where(lp == (lp1 - 1), split_end - lp * PAGE_SIZE, PAGE_SIZE)
+
+            page_base = phys * PAGE_SIZE
+            # Compiler hint: asserts that PAGE_SIZE is a multiple of BLOCK_N.
+            page_base = tl.multiple_of(page_base, BLOCK_N)
+            for s in tl.range(local_start, local_end, BLOCK_N):
+                s = tl.multiple_of(s, MIN_BLOCK_KV)
+                offs_bn = tl.arange(0, BLOCK_N)
+                key_idx = page_base + s + offs_bn
+                k_ptrs = k + key_idx[:, None] * D + offs_d[None, :]
+                # The load is only bounds-checked against the whole cache;
+                # positions past local_end are neutralised via mask_n below.
+                k_blk = tl.load(k_ptrs, mask=(key_idx < CACHE_SIZE)[:, None], other=0.0)
+                qk = tl.dot(q, k_blk.T) * SM_SCALE  # [M, BN]
+
+                offs_n = s + tl.arange(0, BLOCK_N)
+                mask_n = offs_n < local_end
+                qk = tl.where(mask_n[None, :], qk, -float("inf"))
+
+                # Online-softmax update: rescale the running sum and
+                # accumulator to the new row maximum before adding this block.
+                n_e_max = tl.maximum(tl.max(qk, 1), e_max)  # [M]
+                re_scale = tl.exp(e_max - n_e_max)  # [M]
+                acc = acc * re_scale[:, None]  # [M, D]
+                v_ptrs = v + key_idx[:, None] * D + offs_d[None, :]
+                v_blk = tl.load(v_ptrs, mask=(key_idx < CACHE_SIZE)[:, None], other=0.0)
+                p = tl.exp(qk - n_e_max[:, None])  # [M, BN]
+                acc = tl.dot(p.to(DTYPE), v_blk, acc)
+
+                e_sum = e_sum * re_scale + tl.sum(p, 1)
+                e_max = n_e_max
+
+        # write mid outputs [M, D] for this split
+        tmp = (acc / e_sum[:, None]).to(DTYPE)
+        row_mid = pid_b * (KEY_SPLIT * HQ) + pid_s * HQ + base_qh + offs_m
+        mid_ptrs = mid_o + row_mid[:, None] * D + offs_d[None, :]
+        tl.store(mid_ptrs, tmp, mask=mask_m[:, None])
+
+        ml_ptrs = (
+            mid_lse
+            + pid_b * STRIDE_LBS
+            + pid_s * STRIDE_LS
+            + (base_qh + offs_m) * STRIDE_LH
+        )
+        # Padded rows have e_sum == 0; substitute 1.0 so tl.log is not
+        # evaluated at zero (those rows are never stored anyway).
+        safe_sum = tl.where(mask_m, e_sum, 1.0)
+        tl.store(ml_ptrs, e_max + tl.log(safe_sum), mask=mask_m)
+    else:
+        # empty split
+        # Store a zero output with LSE = -inf for this split.
+        zero_md = tl.zeros([GROUP_M_PAD, D], dtype=DTYPE)
+        row_mid = pid_b * (KEY_SPLIT * HQ) + pid_s * HQ + base_qh + offs_m
+        mid_ptrs = mid_o + row_mid[:, None] * D + offs_d[None, :]
+        tl.store(mid_ptrs, zero_md, mask=mask_m[:, None])
+        ml_ptrs = (
+            mid_lse
+            + pid_b * STRIDE_LBS
+            + pid_s * STRIDE_LS
+            + (base_qh + offs_m) * STRIDE_LH
+        )
+        tl.store(ml_ptrs, -float("inf"), mask=mask_m)
+
+
+@triton.jit
+def _varkv_stage2_reduce(
+    mid_o,  # stage-1 partial outputs, indexed as contiguous [B, KEY_SPLIT, HQ, D]
+    mid_lse,  # stage-1 log-sum-exp values, addressed through STRIDE_L*
+    output,  # final attention output, addressed through STRIDE_O* (last dim unit stride)
+    STRIDE_LBS,
+    STRIDE_LS,
+    STRIDE_LH,
+    STRIDE_OBS,
+    STRIDE_OH,
+    B,
+    HQ,
+    D: tl.constexpr,
+    KEY_SPLIT: tl.constexpr,
+    DTYPE: tl.constexpr,
+):
+    # Stage 2 of split-KV decode attention: one program per (batch, query
+    # head) merges the KEY_SPLIT partial outputs into the final output row,
+    # weighting each partial by exp(lse - running max).
+    pid_b = tl.program_id(0)
+    pid_h = tl.program_id(1)
+    offs_d = tl.arange(0, D)
+
+    # across split LSE combine
+    e_sum = 0.0
+    e_max = -float("inf")
+    acc = tl.zeros([D], dtype=tl.float32)
+
+    for s in tl.range(KEY_SPLIT):
+        row_mid = pid_b * (KEY_SPLIT * HQ) + s * HQ + pid_h
+        tv = tl.load(mid_o + row_mid * D + offs_d).to(DTYPE)
+        tl_ptr = mid_lse + pid_b * STRIDE_LBS + s * STRIDE_LS + pid_h * STRIDE_LH
+        tlogic = tl.load(tl_ptr)  # LSE of split s for this (batch, head)
+
+        # Same running-max rescaling as the online softmax in stage 1.
+        # NOTE(review): if every split seen so far has LSE == -inf, the
+        # exponent below is (-inf) - (-inf), i.e. NaN -- confirm that the
+        # first split of a processed row is never empty.
+        n_e_max = tl.maximum(e_max, tlogic)
+        old_scale = tl.exp(e_max - n_e_max)
+        acc = acc * old_scale + tl.exp(tlogic - n_e_max) * tv.to(tl.float32)
+        e_sum = e_sum * old_scale + tl.exp(tlogic - n_e_max)
+        e_max = n_e_max
+
+    # Normalise by the combined softmax denominator and write the result.
+    o = (acc / e_sum).to(DTYPE)
+    o_ptr = output + pid_b * STRIDE_OBS + pid_h * STRIDE_OH + offs_d
+    tl.store(o_ptr, o)
--- a/vllm/compactor-vllm/src/compactor_vllm/attention/sparse_varlen_kernel.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/attention/sparse_varlen_kernel.py
+import logging
+import math
+
+import torch
+import triton
+import triton.language as tl
+
+from compactor_vllm.utils.triton_compat import (
+    autotune as triton_autotune,
+    cuda_capability_geq,
+    maybe_set_allocator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def causal_sparse_varlen_with_cache(
+    q,
+    k,
+    v,
+    k_cache,
+    v_cache,
+    seq_lens_bh,
+    global_page_table,
+    batch_mapping,
+    cu_seqlens_q,
+    max_seqlen_q: int,
+    max_seqlen_k_cache: int,
+    HKV: int,
+    PAGE_SIZE: int,
+    sm_scale=None,
+):
+    """
+    Causal prefill attention over a paged KV cache plus a block of newly
+    appended tokens in a packed batch format.
+
+    This function wraps the Triton kernel
+    ``_causal_head_sparse_varlen_with_cache`` to compute prefill attention for
+    a batch of variable-length sequences, where:
+      • Past keys/values are stored in a paged global KV cache
+        (``k_cache``, ``v_cache``) with a (per-layer) page table.
+
+      • New tokens for this step are given as K/V blocks
+        (``k``, ``v``), together with a packed query block ``q``.
+
+      • The result is equivalent to applying causal attention over the
+        concatenation of:
+            [ cached KV prefix  ||  (K_app, V_app) for this step ]
+        for each sequence in the batch.
+
+    Grouped-query attention (GQA / MQA) is supported by allowing more query
+    heads than KV heads: ``HQ`` must be divisible by ``HKV``.
+
+    Args:
+        :param q:
+            Query tensor of shape ``[N, HQ, D]`` (float16 / bfloat16/float32).
+            ``N`` is the total number of new tokens across the batch
+            (i.e. ``N = sum_b seqlen_q[b]``), packed according to
+            ``cu_seqlens_q``. ``HQ`` is the number of query heads, ``D`` the
+            head dimension (must be a power of two).
+        :param k:
+            New key tensor of shape ``[N, HKV, D]`` for the same tokens as
+            ``q``. These are the K values appended to the cache for this
+            prefill step.
+        :param v:
+            New value tensor of shape ``[N, HKV, D]`` for the same tokens as
+            ``q``.
+        :param k_cache:
+            Global key cache backing buffer of shape ``[CACHE_SIZE, D]``.
+            Keys for all cached tokens and heads are stored here; the mapping
+            from (batch, head, token index) to a row in this buffer is
+            given by ``global_page_table``.
+        :param v_cache:
+            Global value cache of shape ``[CACHE_SIZE, D]``. Must have the
+            same layout as ``k_cache`` (same ``CACHE_SIZE`` and ``D``).
+        :param seq_lens_bh:
+            Tensor of shape ``[B, HKV]`` (int32) giving, for each local batch
+            index and KV head, the number of cached tokens already present
+            in the paged KV cache before this prefill step.
+        :param global_page_table:
+            Tensor of shape ``[MAX_NUM_BATCHES, HKV, N_LOGICAL_PAGES_MAX]`` (int32)
+            mapping ``(true_batch_idx, kv_head, logical_page)`` to a physical
+            page id in the global KV cache. A physical page id `p` refers to
+            the slice:
+                ``k_cache[p * PAGE_SIZE : (p + 1) * PAGE_SIZE]``.
+        :param batch_mapping:
+            Tensor of shape ``[B]`` (int16 / int32) mapping the local batch
+            index used in this kernel launch to the global batch index used
+            to index ``global_page_table``. This allows the same global cache
+            to be shared across multiple microbatches.
+        :param cu_seqlens_q:
+            Tensor of shape ``[B + 1]`` (int32) with cumulative sequence
+            lengths for the *new* tokens (q/k/v) in packed form. For batch
+            element ``b``:
+                ``seqlen_q[b] = cu_seqlens_q[b + 1] - cu_seqlens_q[b]``.
+            The total number of tokens satisfies
+                ``N = cu_seqlens_q[-1]``.
+        :param max_seqlen_q:
+            Maximum new query sequence length across the batch, i.e.
+            ``max_b seqlen_q[b]``.
+        :param max_seqlen_k_cache:
+            Maximum cached sequence length across (batch, KV head), i.e.
+            ``max_{b,h} seq_lens_bh[b, h]``.
+        :param HKV:
+            Number of KV heads. Must divide ``HQ``.
+        :param PAGE_SIZE:
+            Number of tokens stored per physical page in the paged KV cache.
+            ``CACHE_SIZE`` must be divisible by ``PAGE_SIZE``.
+        :param sm_scale:
+            Optional scaling factor applied to the attention logits before
+            softmax. If ``None``, defaults to ``1.0 / sqrt(D)``.
+        :returns torch.Tensor:
+            Attention output of shape ``[N, HQ, D]``, with the same dtype and
+            device as ``q``. The output is laid out in the same packed
+            varlen format as the input queries, i.e. the first
+            ``seqlen_q[0]`` rows correspond to batch 0, the next
+            ``seqlen_q[1]`` rows to batch 1, etc.
+    """
+    assert q.ndim == 3, "q should be [N, HQ, D]"
+    N, HQ, D = q.shape
+    assert (D & (D - 1)) == 0, "D must be power of two"
+
+    B = cu_seqlens_q.numel() - 1
+    assert B > 0
+    assert HQ % HKV == 0, "Number of query heads must divide number of keys heads"
+    H_g = HQ // HKV
+    # view Q as [HKV, N, QUERY_GROUP_SIZE, D]
+    out = torch.empty_like(q)
+    q = q.view(N, HKV, H_g, D).permute(1, 0, 2, 3)
+    out = out.view(N, HKV, H_g, D).permute(1, 0, 2, 3)
+
+    # K_app/V_app: [N, HKV, D] -> [HKV, N, D]
+    k_app = k.view(N, HKV, D).permute(1, 0, 2)
+    v_app = v.view(N, HKV, D).permute(1, 0, 2)
+
+    cu_seqlens_q = cu_seqlens_q.to(dtype=torch.int32, device=q.device)
+    seq_lens_bh = seq_lens_bh.to(dtype=torch.int32, device=q.device)
+    batch_mapping = batch_mapping.to(dtype=torch.int16, device=q.device)
+
+    N_LOGICAL_PAGES_MAX = global_page_table.shape[-1]
+    CACHE_SIZE = k_cache.shape[0]
+    assert v_cache.shape[0] == CACHE_SIZE
+    assert k_cache.shape[1] == D and v_cache.shape[1] == D
+    assert PAGE_SIZE > 0 and CACHE_SIZE % PAGE_SIZE == 0
+
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(D)
+
+    # strides for Q [G, N, QUERY_GROUP_SIZE, D]
+    STRIDE_Q_G, STRIDE_Q_N, STRIDE_Q_H, STRIDE_Q_D = q.stride()
+    STRIDE_KC, STRIDE_VC = k_cache.stride(0), v_cache.stride(0)
+    # [G, N, D]
+    STRIDE_KA_G, STRIDE_KA_N, STRIDE_KA_D = k_app.stride()
+    STRIDE_VA_G, STRIDE_VA_N, STRIDE_VA_D = v_app.stride()
+
+    # OUT [G, N, QUERY_GROUP_SIZE, D]
+    STRIDE_OUT_G, STRIDE_OUT_N, STRIDE_OUT_H, STRIDE_OUT_D = out.stride()
+    # launch grid
+    maybe_set_allocator(
+        lambda size, align, _: torch.empty(size, dtype=torch.int8, device=q.device)
+    )
+    assert STRIDE_KA_D == STRIDE_VA_D == STRIDE_Q_D == STRIDE_OUT_D == 1, (
+        "final dimension must be contiguous"
+    )
+
+    def grid(META):
+        return HKV, B, triton.cdiv(max_seqlen_q, META["BLOCK_M"])
+
+    # On a fresh batch, max_seqlen_k_cache==0 (no KV prefix yet). Passing
+    # `triton.next_power_of_2(0)` into autotune constexpr keys breaks
+    # kernel selection / tuning and can yield garbage outputs.
+    _k_max_autotune = max(int(max_seqlen_k_cache), 1)
+    AUTOTUNE_MAX_Q_LEN = triton.next_power_of_2(max_seqlen_q)
+    AUTOTUNE_MAX_K_LEN = triton.next_power_of_2(_k_max_autotune)
+    _causal_head_sparse_varlen_with_cache[grid](
+        Q=q,
+        K_cache=k_cache,
+        V_cache=v_cache,
+        K_app=k_app,
+        V_app=v_app,
+        cu_seqlens_qk=cu_seqlens_q,
+        seq_lens_bh=seq_lens_bh,
+        page_table=global_page_table,
+        batch_mapping=batch_mapping,
+        OUT=out,
+        HKV=HKV,
+        QUERY_GROUP_SIZE=H_g,
+        PAGE_SIZE=PAGE_SIZE,
+        N_LOGICAL_PAGES_MAX=N_LOGICAL_PAGES_MAX,
+        STRIDE_Q_G=STRIDE_Q_G,
+        STRIDE_Q_N=STRIDE_Q_N,
+        STRIDE_Q_H=STRIDE_Q_H,
+        STRIDE_KC=STRIDE_KC,
+        STRIDE_VC=STRIDE_VC,
+        STRIDE_KA_G=STRIDE_KA_G,
+        STRIDE_KA_N=STRIDE_KA_N,
+        STRIDE_VA_G=STRIDE_VA_G,
+        STRIDE_VA_N=STRIDE_VA_N,
+        STRIDE_OUT_G=STRIDE_OUT_G,
+        STRIDE_OUT_N=STRIDE_OUT_N,
+        STRIDE_OUT_H=STRIDE_OUT_H,
+        sm_scale=sm_scale,
+        D=D,
+        AUTOTUNE_MAX_Q_LEN=AUTOTUNE_MAX_Q_LEN,
+        AUTOTUNE_MAX_K_LEN=AUTOTUNE_MAX_K_LEN,
+    )
+    return out.permute(1, 0, 2, 3).view(N, HQ, D)  # already contiguous
+
+
+# Autotune candidates returned by get_autotune_configs() when
+# cuda_capability_geq(9, 0) holds. Each row of the table is
+# (BLOCK_N, BLOCK_M, WARPSPEC, num_warps, num_stages); the row order is the
+# order in which the configs are handed to the autotuner.
+autotune_configs_cc9 = [
+    triton.Config(
+        {"BLOCK_N": block_n, "BLOCK_M": block_m, "WARPSPEC": warpspec},
+        num_warps=warps,
+        num_stages=stages,
+    )
+    for block_n, block_m, warpspec, warps, stages in (
+        (64, 64, True, 16, 3),
+        (64, 64, True, 8, 3),
+        (64, 32, True, 8, 4),
+        (64, 32, True, 8, 3),
+        (64, 32, False, 4, 3),
+        (64, 16, True, 8, 3),
+        (64, 16, True, 8, 4),
+        (64, 16, False, 4, 4),
+        (32, 32, True, 8, 4),
+        (32, 32, False, 8, 4),
+        (32, 16, False, 8, 3),
+        (32, 16, False, 4, 4),
+    )
+]
+
+# Autotune candidates returned by get_autotune_configs() when
+# cuda_capability_geq(9, 0) does not hold: the cross product of the listed
+# BLOCK_N values, warp counts and stage counts, with BLOCK_M fixed at 64.
+autotune_configs_cc8 = [
+    triton.Config(
+        {"BLOCK_N": block_n, "BLOCK_M": 64, "WARPSPEC": True},
+        num_warps=warps,
+        num_stages=stages,
+    )
+    for block_n in (16, 32)
+    for warps in (4, 8)
+    for stages in (2, 3)
+]
+
+
+def prune_invalid_configs(configs, _, **kwargs):
+    """
+    Drop autotune configs that pair ``BLOCK_N == 32`` with ``num_stages == 4``.
+
+    Args:
+        :param configs:
+            Iterable of config objects exposing a ``kwargs`` dict of
+            meta-parameters (e.g. ``triton.Config``).
+        :param _:
+            Ignored. NOTE(review): presumably the kernel's named arguments,
+            as passed to a Triton ``early_config_prune`` hook -- confirm at
+            the registration site.
+        :param kwargs:
+            Ignored extra keyword arguments.
+
+    Returns:
+        :return list:
+            The configs that survive the filter, in their original order.
+    """
+
+    def _stages(conf):
+        # triton.Config stores num_stages as an attribute; its ``kwargs`` dict
+        # only holds the meta-parameters (BLOCK_N, BLOCK_M, ...). Looking it up
+        # in ``kwargs`` alone therefore never matched and nothing was pruned.
+        # The kwargs lookup is kept as a fallback for config-like objects that
+        # do carry it there.
+        stages = getattr(conf, "num_stages", None)
+        return conf.kwargs.get("num_stages") if stages is None else stages
+
+    return [
+        conf
+        for conf in configs
+        if not (conf.kwargs.get("BLOCK_N") == 32 and _stages(conf) == 4)
+    ]
+
+
+def get_autotune_configs():
+    """
+    Pick the autotune config list for the current device.
+
+    Returns ``autotune_configs_cc9`` when ``cuda_capability_geq(9, 0)`` is
+    truthy and ``autotune_configs_cc8`` otherwise.
+    """
+    use_cc9_configs = cuda_capability_geq(9, 0)
+    return autotune_configs_cc9 if use_cc9_configs else autotune_configs_cc8
+
+
+# Attention kernel over (1) per-head paged cached K/V and (2) freshly appended
+# K/V with a causal mask, for packed variable-length batches.
+# Autotuning is keyed on the head/tile geometry plus the two AUTOTUNE_MAX_*
+# length arguments listed in `key`.
+@triton_autotune(
+    configs=get_autotune_configs(),
+    key=[
+        "HKV",
+        "QUERY_GROUP_SIZE",
+        "D",
+        "PAGE_SIZE",
+        "AUTOTUNE_MAX_K_LEN",
+        "AUTOTUNE_MAX_Q_LEN",
+    ],
+    cache_results=True,
+)
+@triton.jit
+def _causal_head_sparse_varlen_with_cache(
+    Q,  # [HKV, N, QUERY_GROUP_SIZE, D] (non-contiguous)
+    K_cache,
+    V_cache,  # [CACHE_SIZE, D]
+    K_app,
+    V_app,  # [HKV, N, D]
+    cu_seqlens_qk,  # [B+1]
+    seq_lens_bh,  # [B, HKV]
+    page_table,  # [B_total, HKV, N_LOGICAL_PAGES_MAX]
+    batch_mapping,  # [B], maps local b -> global batch index
+    OUT,  # [HKV, N, QUERY_GROUP_SIZE, D]
+    #
+    HKV: tl.constexpr,
+    QUERY_GROUP_SIZE: tl.constexpr,
+    PAGE_SIZE: tl.constexpr,
+    N_LOGICAL_PAGES_MAX,
+    STRIDE_Q_G,
+    STRIDE_Q_N,
+    STRIDE_Q_H,
+    STRIDE_KC,
+    STRIDE_VC,
+    STRIDE_KA_G,
+    STRIDE_KA_N,
+    STRIDE_VA_G,
+    STRIDE_VA_N,
+    STRIDE_OUT_G,
+    STRIDE_OUT_N,
+    STRIDE_OUT_H,
+    sm_scale,
+    #
+    D: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    WARPSPEC: tl.constexpr,
+    AUTOTUNE_MAX_Q_LEN: tl.constexpr,  # used for autotune key
+    AUTOTUNE_MAX_K_LEN: tl.constexpr,  # used for autotune key
+):
+    # One program handles one (kv head, batch entry, query tile) triple. All
+    # QUERY_GROUP_SIZE query heads that share the kv head are processed
+    # together by stacking them along the row axis of the Q tile.
+    # The softmax is computed online (running max `e_max`, running sum `e_sum`,
+    # unnormalised accumulator `acc`) in base 2, first over the cached keys and
+    # then over the appended keys; the result is normalised once at the end.
+    # NOTE(review): WARPSPEC is a tuned meta-parameter but is not referenced
+    # anywhere in this body -- confirm whether it is meant to be consumed.
+    TOTAL_N_QUERIES: tl.constexpr = BLOCK_M * QUERY_GROUP_SIZE
+    pid_g = tl.program_id(0)  # kv_head id in [0, HKV)
+    pid_b = tl.program_id(1)  # batch id
+    pid_m = tl.program_id(2)  # query-tile id within batch
+
+    # batch segment [off_b, off_b1) in N
+    off_b = tl.load(cu_seqlens_qk + pid_b)
+    off_b1 = tl.load(cu_seqlens_qk + pid_b + 1)
+    seq_len_append = off_b1 - off_b
+
+    q_start = off_b + pid_m * BLOCK_M
+    q_end = tl.minimum(q_start + BLOCK_M, off_b1)
+    # number of queries in this tile for this batch
+    M = q_end - q_start
+    # tiles that start past the end of this batch entry have nothing to do
+    if M <= 0:
+        return
+
+    # cached length for (b, kv_head=pid_g)
+    L_cache = tl.load(seq_lens_bh + pid_b * HKV + pid_g)
+    # row indices flattened over [QUERY_GROUP_SIZE, BLOCK_M]:
+    # row = row_h * BLOCK_M + row_m
+    offs_row = tl.arange(0, TOTAL_N_QUERIES)
+    row_m = offs_row % BLOCK_M
+    row_h = offs_row // BLOCK_M
+    # valid rows: only those with row_m < M
+    row_mask = row_m < M
+
+    # global query index per row
+    q_idx = q_start + row_m
+    offs_d = tl.arange(0, D)
+    # Q tile: [TOTAL_N_QUERIES, D]
+    # Q layout: [HKV, N, QUERY_GROUP_SIZE, D]
+    # (no stride is applied along D, so the last dimension is read as unit-stride)
+    q_ptrs = (
+        Q
+        + pid_g * STRIDE_Q_G
+        + q_idx[:, None] * STRIDE_Q_N
+        + row_h[:, None] * STRIDE_Q_H
+        + offs_d[None, :]
+    )
+    q = tl.load(q_ptrs, mask=row_mask[:, None], other=0.0)
+
+    # online-softmax state, one entry per query row
+    e_max = tl.zeros([TOTAL_N_QUERIES], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([TOTAL_N_QUERIES], dtype=tl.float32)
+    acc = tl.zeros([TOTAL_N_QUERIES, D], dtype=tl.float32)
+
+    offs_block_n = tl.arange(0, BLOCK_N)
+    # 1.44269504 ~= log2(e): pre-scaling the logits lets the softmax use exp2
+    qk_scale = sm_scale * 1.44269504
+
+    # 1) attend over cached K/V
+    if L_cache > 0:
+        # map local (b) to global batch index
+        mapped_b = tl.load(batch_mapping + pid_b)
+        pt_base = (mapped_b * HKV + pid_g) * N_LOGICAL_PAGES_MAX
+        # iterate logical pages
+        num_lp = tl.cdiv(L_cache, PAGE_SIZE)
+        for lp in tl.range(0, num_lp):
+            # can overflow in 32 bits so upcast
+            phys = tl.load(page_table + pt_base + lp).to(tl.int64)
+            page_start = phys * PAGE_SIZE
+            # how many valid tokens in this page for this (b,g)
+            remain = L_cache - lp * PAGE_SIZE
+            page_len = tl.minimum(PAGE_SIZE, remain)
+            # iterate over this page in BLOCK_N chunks
+            for ks in tl.range(0, page_len, BLOCK_N):
+                offs_n = ks + offs_block_n
+                mask_n = offs_n < page_len
+
+                key_idx = page_start + offs_n
+                k_ptrs = K_cache + key_idx[:, None] * STRIDE_KC + offs_d[None, :]
+
+                k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0)  # [BN, D]
+                qk = tl.dot(q, k.T) * qk_scale  # [TOTAL_N_QUERIES, BN]
+                # masked positions get a large negative finite logit (not -inf)
+                qk = tl.where(row_mask[:, None] & mask_n[None, :], qk, -1.0e6)
+
+                # softmax update
+                cur_max = tl.max(qk, 1)
+                n_e_max = tl.maximum(e_max, cur_max)
+                # factor that rebases the previous accumulator onto the new max
+                re_scale = tl.math.exp2(e_max - n_e_max)
+                p = tl.math.exp2(qk - n_e_max[:, None])
+
+                v_ptrs = V_cache + key_idx[:, None] * STRIDE_VC + offs_d[None, :]
+                v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0)  # [BN, D]
+
+                acc = acc * re_scale[:, None]
+                acc = tl.dot(p.to(v.dtype), v, acc)
+
+                e_sum = e_sum * re_scale + tl.sum(p, 1)
+                e_max = n_e_max
+
+    # 2) attend over appended K_app/V_app (causal)
+    # appended tokens for batch b are in [off_b, off_b1)
+    # query tile is [q_start, q_end)
+    # for each query at index q_idx, valid appended keys k satisfy off_b <= k <= q_idx
+    if q_end > off_b:
+        # exactly one appended token
+        if seq_len_append == 1:
+            # single key/value row: use an elementwise product instead of tl.dot
+            ka_ptrs = K_app + pid_g * STRIDE_KA_G + off_b * STRIDE_KA_N + offs_d
+            k = tl.load(ka_ptrs)  # [D]
+            qk = tl.sum(q * k[None, :], 1) * qk_scale
+            qk = tl.where(row_mask, qk, -1.0e6)
+            n_e_max = tl.maximum(e_max, qk)
+            re_scale = tl.math.exp2(e_max - n_e_max)
+            p = tl.math.exp2(qk - n_e_max)
+            va_ptrs = V_app + pid_g * STRIDE_VA_G + off_b * STRIDE_VA_N + offs_d
+            v = tl.load(va_ptrs)  # [D]
+            acc = acc * re_scale[:, None] + p[:, None] * v[None, :]
+            e_sum = e_sum * re_scale + p
+        else:
+            # off-band: k in [off_b, q_start)
+            # for all queries t in [q_start, q_end), any k < q_start satisfies k <= t.
+            # so no causal mask needed.
+            off_band_start = off_b
+            off_band_end = q_start
+
+            if off_band_end > off_band_start:
+                for ks in tl.range(off_band_start, off_band_end, BLOCK_N):
+                    offs_n = ks + offs_block_n
+                    mask_n = offs_n < off_band_end
+
+                    ka_ptrs = (
+                        K_app
+                        + pid_g * STRIDE_KA_G
+                        + offs_n[:, None] * STRIDE_KA_N
+                        + offs_d[None, :]
+                    )
+                    k = tl.load(ka_ptrs, mask=mask_n[:, None], other=0.0)
+
+                    qk = tl.dot(q, k.T) * qk_scale
+                    qk = tl.where(row_mask[:, None] & mask_n[None, :], qk, -1.0e6)
+
+                    cur_max = tl.max(qk, 1)
+                    n_e_max = tl.maximum(e_max, cur_max)
+
+                    re_scale = tl.math.exp2(e_max - n_e_max)
+                    p = tl.math.exp2(qk - n_e_max[:, None])
+
+                    va_ptrs = (
+                        V_app
+                        + pid_g * STRIDE_VA_G
+                        + offs_n[:, None] * STRIDE_VA_N
+                        + offs_d[None, :]
+                    )
+                    v = tl.load(va_ptrs, mask=mask_n[:, None], other=0.0)
+
+                    acc = acc * re_scale[:, None]
+                    acc = tl.dot(p.to(v.dtype), v, acc)
+
+                    e_sum = e_sum * re_scale + tl.sum(p, 1)
+                    e_max = n_e_max
+
+            # on-band remaining k: keys in [q_start, q_end) overlap the query
+            # tile itself, so the per-row causal mask is required here
+            on_band_start = tl.maximum(q_start, off_b)
+            if on_band_start < q_end:
+                for ks in tl.range(on_band_start, q_end, BLOCK_N):
+                    offs_n = ks + tl.arange(0, BLOCK_N)
+                    mask_n = offs_n < q_end
+
+                    ka_ptrs = (
+                        K_app
+                        + pid_g * STRIDE_KA_G
+                        + offs_n[:, None] * STRIDE_KA_N
+                        + offs_d[None, :]
+                    )
+
+                    k = tl.load(ka_ptrs, mask=mask_n[:, None], other=0.0)
+
+                    qk = tl.dot(q, k.T) * qk_scale
+
+                    # key index must not exceed the query's own global index
+                    caus_mask = offs_n[None, :] <= q_idx[:, None]
+                    full_mask = row_mask[:, None] & mask_n[None, :] & caus_mask
+
+                    qk = tl.where(full_mask, qk, -1.0e6)
+
+                    cur_max = tl.max(qk, 1)
+                    n_e_max = tl.maximum(e_max, cur_max)
+                    re_scale = tl.math.exp2(e_max - n_e_max)
+                    p = tl.math.exp2(qk - n_e_max[:, None])
+
+                    va_ptrs = (
+                        V_app
+                        + pid_g * STRIDE_VA_G
+                        + offs_n[:, None] * STRIDE_VA_N
+                        + offs_d[None, :]
+                    )
+                    v = tl.load(va_ptrs, mask=mask_n[:, None], other=0.0)
+
+                    acc = acc * re_scale[:, None]
+                    acc = tl.dot(p.to(v.dtype), v, acc)
+
+                    e_sum = e_sum * re_scale + tl.sum(p, 1)
+                    e_max = n_e_max
+
+    # 3) write outputs: normalise by the softmax denominator, cast back to the
+    # query dtype and store only the valid rows
+    o = (acc / e_sum[:, None]).to(q.dtype)
+    out_ptrs = (
+        OUT
+        + pid_g * STRIDE_OUT_G
+        + q_idx[:, None] * STRIDE_OUT_N
+        + row_h[:, None] * STRIDE_OUT_H
+        + offs_d[None, :]
+    )
+    tl.store(out_ptrs, o, mask=row_mask[:, None])
+
--- a/vllm/compactor-vllm/src/compactor_vllm/benchmark/__init__.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/benchmark/__init__.py
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/__init__.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/__init__.py
+from compactor_vllm.compression.common import (
+    BaseCompressionMethod,
+    NoCompression,
+)
+from compactor_vllm.compression.criticalkv import CriticalAdaKVCompression
+from compactor_vllm.compression.compactor import CompactorCompression
+from compactor_vllm.compression.compression_config import (
+    BatchCompressionParams,
+    CompressionMethod,
+    SequenceCompressionParams,
+)
+from compactor_vllm.compression.snapkv import SnapKVCompression
+
+# Maps every CompressionMethod member to the class that implements its
+# pre-/post-RoPE scoring hooks; the apply_* helpers in this module dispatch
+# through this table.
+COMPRESSION_REGISTRY: dict[CompressionMethod, type[BaseCompressionMethod]] = {
+    CompressionMethod.CRITICALADAKV: CriticalAdaKVCompression,
+    CompressionMethod.COMPACTOR: CompactorCompression,
+    CompressionMethod.SNAPKV: SnapKVCompression,
+    CompressionMethod.NONE: NoCompression,
+}
+
+
+def apply_prerope_compression(q, k, v, context):
+    """
+    Run the pre-RoPE scoring hook of the compression method chosen in ``context``.
+
+    The method is read from ``context.compression_context.compression_method``
+    and resolved through ``COMPRESSION_REGISTRY``; the hook's return value is
+    passed back unchanged.
+    """
+    compression_cls = COMPRESSION_REGISTRY[
+        context.compression_context.compression_method
+    ]
+    return compression_cls.pre_rope_scoring(q, k, v, context=context)
+
+
+def apply_postrope_compression(q, k, v, prerope_scores, context):
+    """
+    Run the post-RoPE scoring hook of the compression method chosen in ``context``.
+
+    The method is read from ``context.compression_context.compression_method``
+    and resolved through ``COMPRESSION_REGISTRY``. ``prerope_scores`` is
+    forwarded to the hook as-is, and the hook's return value is passed back
+    unchanged.
+    """
+    compression_cls = COMPRESSION_REGISTRY[
+        context.compression_context.compression_method
+    ]
+    return compression_cls.post_rope_scoring(
+        q, k, v, prerope_scores, context=context
+    )
+
+
+# Names exported by a star-import of this package.
+__all__ = [
+    "apply_prerope_compression",
+    "apply_postrope_compression",
+    "CompressionMethod",
+    "BatchCompressionParams",
+    "SequenceCompressionParams",
+    "COMPRESSION_REGISTRY"
+]
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/common.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/common.py
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+from compactor_vllm.kv_cache.store_kv_cache import prefill_store_topk_kv
+
+
+class BaseCompressionMethod(ABC):
+    """
+    Interface that every KV cache compression method implements.
+
+    Scoring is split into two hooks placed around the rotary position
+    embedding (RoPE) step:
+
+      1. ``pre_rope_scoring`` sees Q/K before RoPE is applied.
+
+      2. ``post_rope_scoring`` sees Q/K after RoPE and may either
+         - refine / reweight the scores produced by the first hook, or
+         - compute new, potentially position-aware scores.
+
+    Subclasses provide both hooks as static methods. Each hook returns a
+    single score tensor, or ``None`` when it has nothing to contribute; the
+    caller feeds the result into the shared
+    "scores -> top-k indices -> KV extraction" pipeline.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def pre_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        context,
+    ) -> Optional[torch.Tensor]:
+        """
+        Score tokens using queries/keys taken before RoPE.
+
+        Args:
+            :param q:
+                Pre-RoPE query tensor. Shape ``[total_tokens, HQ, D]``.
+            :param k:
+                Pre-RoPE key tensor. Shape ``[total_tokens, HKV, D]``.
+            :param v:
+                Value tensor. Shape ``[total_tokens, HKV, D]``.
+            :param context:
+                ``compactor_vllm.utils.context.Context`` object carrying
+                additional metadata such as batch mappings or temporary
+                buffers.
+
+        Returns:
+            :return Optional[torch.Tensor]:
+                Importance scores (e.g. per token and per head) of shape
+                ``[total_tokens, HKV]``, to be handed to
+                ``post_rope_scoring`` or straight to the top-k selection
+                step. Implementations for which this phase is a no-op
+                should return ``None``.
+        """
+
+    @staticmethod
+    @abstractmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: Optional[torch.Tensor],
+        context,
+    ) -> Optional[torch.Tensor]:
+        """
+        Produce the final importance scores once RoPE has been applied.
+
+        The hook may combine the post-RoPE Q/K with whatever
+        ``pre_rope_scoring`` returned. Typical strategies:
+          * Treat ``pre_rope_scores`` as the base signal and apply a
+            position-aware correction on top.
+          * Ignore the earlier scores and compute only position-dependent
+            (absolute or relative) scores.
+          * Return ``pre_rope_scores`` untouched.
+
+        Args:
+            :param q:
+                Post-RoPE query tensor. Shape ``[total_tokens, HQ, D]``.
+            :param k:
+                Post-RoPE key tensor. Shape ``[total_tokens, HKV, D]``.
+            :param v:
+                Value tensor. Shape ``[total_tokens, HKV, D]``.
+            :param pre_rope_scores:
+                Scores returned by ``pre_rope_scoring``; ``None`` when the
+                pre-RoPE phase returned ``None``.
+            :param context:
+                ``compactor_vllm.utils.context.Context`` object carrying
+                additional metadata such as batch mappings or temporary
+                buffers.
+
+        Returns:
+            :return Optional[torch.Tensor]:
+                Final importance scores consumed by the compression
+                pipeline for top-k token selection. A no-op implementation
+                may return ``pre_rope_scores``. Returning ``None`` means no
+                compression is applied.
+        """
+
+
+class NoCompression(BaseCompressionMethod):
+    """
+    Trivial compression method that disables KV cache compression.
+
+    ``pre_rope_scoring`` yields no scores and ``post_rope_scoring`` hands back
+    whatever it was given, so the pair evaluates to ``None`` when chained.
+    """
+
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        """Return ``None``: this method computes no pre-RoPE scores."""
+        return None
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        # Optional, matching the abstract signature: the pre-RoPE hook of this
+        # class always returns None, so None is the expected input here.
+        pre_rope_scores: Optional[torch.Tensor],
+        context,
+    ) -> Optional[torch.Tensor]:
+        """Return ``pre_rope_scores`` unchanged."""
+        return pre_rope_scores
+
+
+def extract_and_store_top_kv(
+    scores: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    max_k_len: int,
+    top_k: int,
+    H: int,
+    new_keys: torch.Tensor,  # [N_total, H, D]
+    new_vals: torch.Tensor,  # [N_total, H, D]
+    num_tokens_to_retain: torch.Tensor,  # [B] int32
+    page_table: torch.Tensor,  # [B_total, H, N_LOGICAL_PAGES_MAX] int32
+    batch_mapping: torch.Tensor,  # [B] int32 (local -> true batch rows)
+    bh_lens: torch.Tensor,  # [B, H] int32 (contiguous), UPDATED atomically
+    k_cache: torch.Tensor,  # [N_PAGES * PAGE_SIZE, D]
+    v_cache: torch.Tensor,  # [N_PAGES * PAGE_SIZE, D]
+    PAGE_SIZE: int,
+    PAD_TO_PAGE_SIZE: bool = True,
+    K_TILE: int = 16,
+    padding: float = -float("inf"),
+):
+    """
+    Pick the top-k (token, head) indices from ``scores`` and store the
+    matching keys/values into the KV cache.
+
+    The selection (``scores_to_retain_indices``) and the store
+    (``prefill_store_topk_kv``) are bundled in one helper so that both can be
+    executed in a single stream. Nothing is returned.
+    """
+    selection_kwargs = dict(
+        cu_seqlens_k=cu_seqlens_k,
+        max_k_len=max_k_len,
+        top_k=top_k,
+        H=H,
+        padding=padding,
+    )
+    retained_indices = scores_to_retain_indices(scores, **selection_kwargs)
+
+    prefill_store_topk_kv(
+        new_keys=new_keys,
+        new_vals=new_vals,
+        indices_topk=retained_indices,
+        num_tokens_to_retain=num_tokens_to_retain,
+        page_table=page_table,
+        batch_mapping=batch_mapping,
+        bh_lens=bh_lens,
+        k_cache=k_cache,
+        v_cache=v_cache,
+        cu_seqlens_k=cu_seqlens_k,
+        PAGE_SIZE=PAGE_SIZE,
+        PAD_TO_PAGE_SIZE=PAD_TO_PAGE_SIZE,
+        K_TILE=K_TILE,
+    )
+
+
+def scores_to_retain_indices(
+    scores: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    max_k_len: int,
+    top_k: int,
+    H: int,
+    padding: float = -float("inf"),
+) -> torch.Tensor:
+    """
+    Select global top-k token-head indices per sequence from packed scores.
+
+    This helper takes per-token, per-head scores in packed varlen form and
+    returns, for each batch element, the indices of the top-k (token, head)
+    pairs in the flattened global layout.
+    Inputs are assumed to follow the usual packed varlen convention:
+      * ``scores`` is laid out as ``[N_total, H]``, where
+          ``N_total = sum_b seqlen_k[b]``
+        and ``H`` is the number of KV heads.
+
+      * ``cu_seqlens_k`` is ``[B + 1]`` (int32), giving cumulative lengths
+        for the keys per batch:
+            ``seqlen_k[b] = cu_seqlens_k[b + 1] - cu_seqlens_k[b]``.
+
+      * ``max_k_len`` is an upper bound on ``seqlen_k[b]`` across the batch.
+
+    The function pads each sequence to length ``max_k_len`` with ``padding``
+    (default: ``-inf``), flattens the per-sequence scores into shape
+    ``[B, max_k_len * H]``, and runs a per-batch top-k. The returned indices
+    are shifted so that they directly index into the flattened global
+    score layout of shape ``[N_total * H]``:
+        global_index = (token_global_offset * H) + head_index
+
+    Args:
+        :param scores:
+            Tensor of shape ``[N_total, H]`` containing scores for each
+            (token, head) pair in packed varlen format.
+        :param cu_seqlens_k:
+            Tensor of shape ``[B + 1]`` (int32) with cumulative key sequence
+            lengths for each batch element. The total number of tokens
+            satisfies ``N_total = cu_seqlens_k[-1]``.
+        :param max_k_len:
+            Maximum key sequence length across the batch (i.e.
+            ``max_b seqlen_k[b]``). Used to allocate the padded buffer.
+        :param top_k:
+            Number of (token, head) entries to retain **per batch element**.
+            If ``top_k > max_k_len * H``, it is clamped to ``max_k_len * H``.
+        :param H:
+            Number of key heads; must match ``scores.shape[1]``.
+        :param padding:
+            Padding value used when extending sequences shorter than
+            ``max_k_len``. With the default ``-inf`` padded slots rank last,
+            but they are still returned for a sequence whose
+            ``seqlen_k[b] * H`` is smaller than the effective ``k``; such
+            entries fall outside that sequence's own index range, so callers
+            must bound how many entries per row they consume.
+
+    Returns:
+        :return torch.Tensor:
+            Tensor of shape ``[B, k_eff]`` (int64) where
+            ``k_eff = min(top_k, max_k_len * H)``. Each entry is a global
+            index into the flattened score array of shape ``[N_total * H]``
+            (i.e. scores viewed as ``scores.view(-1)``).
+    """
+    # idea: pad and then select top-k.
+    batch_size, device = cu_seqlens_k.numel() - 1, scores.device
+    padded = torch.full(
+        (batch_size, max_k_len, H),
+        fill_value=padding,
+        dtype=scores.dtype,
+        device=device,
+    )
+    # Fetch all sequence boundaries in one host transfer instead of issuing
+    # two device->host reads per batch element.
+    bounds = cu_seqlens_k.tolist()
+    for b, (s, e) in enumerate(zip(bounds[:-1], bounds[1:])):
+        padded[b, : e - s, :].copy_(scores[s:e, :])
+    flat = padded.view(batch_size, max_k_len * H)
+    k_eff = min(top_k, max_k_len * H)
+    idx = torch.topk(flat, k=k_eff, dim=1, largest=True, sorted=True).indices
+    # Widen before multiplying so token_offset * H cannot wrap in int32.
+    row_offsets = cu_seqlens_k[:-1].to(torch.int64) * H
+    return idx + row_offsets.unsqueeze(-1)
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/compactor.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/compactor.py
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/compactor_origin.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/compactor_origin.py
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/compression_config.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/compression_config.py
+import logging
+from dataclasses import dataclass
+from enum import Enum, auto
+
+logger = logging.getLogger(__name__)
+
+
+class CompressionMethod(Enum):
+    """Identifiers of the available KV cache compression methods."""
+
+    # Values are spelled out; they are the same integers auto() assigns when
+    # members are declared in this order.
+    CRITICALADAKV = 1
+    COMPACTOR = 2
+    SNAPKV = 3
+    NONE = 4
+
+
+# class CachingPolicy(Enum):
+#     CACHE_PROMPT = auto()
+#     DONT_CACHE = auto()
+
+
+# class CompressionType(Enum):
+#     QUERY_AWARE = auto()
+#     QUERY_AGNOSTIC = auto()
+
+
+# Per-sequence compression settings; every field has a default.
+@dataclass
+class SequenceCompressionParams:
+    # NOTE(review): presumably the fraction of KV entries kept, with 1.0
+    # meaning "keep everything" -- confirm against the engine that reads it.
+    compression_ratio: float = 1.0
+    # NOTE(review): presumably the number of leading tokens exempt from
+    # eviction -- confirm.
+    protected_first_tokens: int = 16
+    # NOTE(review): presumably the number of trailing tokens exempt from
+    # eviction -- confirm.
+    protected_last_tokens: int = 64
+
+
+@dataclass
+class BatchCompressionParams:
+    """
+    Batch-level compression settings.
+
+    ``compression_method`` selects the compression method,
+    ``do_chunked_compression`` toggles chunked compression and ``chunk_size``
+    is the chunk length used with it (NOTE(review): presumably measured in
+    tokens -- confirm against the consumer).
+    """
+
+    # compression_type: CompressionType = CompressionType.QUERY_AGNOSTIC
+    compression_method: CompressionMethod = CompressionMethod.COMPACTOR
+
+    do_chunked_compression: bool = True
+    chunk_size: int = 512
+
+    def __post_init__(self):
+        """Force chunked compression off for SNAPKV, warning only on a real change."""
+        # Only warn when chunking was actually requested; a caller that already
+        # passed do_chunked_compression=False has nothing to be told about.
+        if (
+            self.compression_method == CompressionMethod.SNAPKV
+            and self.do_chunked_compression
+        ):
+            self.do_chunked_compression = False
+            logger.warning(
+                "CompressionMethod.SNAPKV is not compatible with chunked compression. Disabling it."
+            )
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/criticalkv-cursor.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/criticalkv-cursor.py
+"""
+CriticalAdaKV: 在 Compactor（pre RoPE 杠杆分 + post RoPE 非因果注意力融合）基础上，
+用输出投影 Wo 对 Value 的 L1 范数做 Stage-2 重加权；Stage-1 在 Compactor 基础分上做预算内 top-k 保护。
+
+预算与 compactor_vllm 引擎一致：使用 ``compression_context.batch_tokens_to_retain``（flatten 的
+(token, head) 对数量）及首/尾保护段长度。
+
+注意：不得在 import 时加载 ``compactor_vllm.utils.context``（其会再 import ``CompressionMethod``，
+与 ``compression/__init__.py`` 导入本模块形成环）。运行时只使用与 ``CompressionContext`` 同字段的 duck 对象。
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional, Tuple
+
+import torch
+import triton
+from triton import language as tl
+
+from compactor_vllm.compression.common import BaseCompressionMethod
+from compactor_vllm.compression.compactor import (
+    CompactorCompression,
+    non_causal_attn_scores,
+)
+from compactor_vllm.compression.snapkv import SnapKVCompression
+from compactor_vllm.utils.helpers import maybe_execute_in_stream
+from compactor_vllm.utils.triton_compat import autotune as triton_autotune
+
+
+
+# ============================================================================
+# Triton Kernel 1: compute ||Wo @ V||₁ (L1 norm)
+# ============================================================================
+@triton_autotune(
+    configs=[
+        triton.Config({"BLOCK_K": bk, "BLOCK_D": bd}, num_warps=nw, num_stages=ns)
+        for bk in [32, 64, 128]
+        for bd in [32, 64]
+        for nw in [4, 8]
+        for ns in [3, 4]
+    ],
+    key=["Hk", "D", "HIDDEN"],
+    cache_results=True,
+)
+@triton.jit
+def _compute_wo_v_l1_kernel(
+    V,
+    WO,
+    cu_k,
+    OUT,
+    STRIDE_V_NK,
+    STRIDE_V_HK,
+    STRIDE_V_D,
+    STRIDE_WO_HQ,
+    STRIDE_WO_D,
+    STRIDE_WO_HID,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    Hk: tl.constexpr,
+    Hq: tl.constexpr,
+    D: tl.constexpr,
+    HIDDEN: tl.constexpr,
+    QUERY_GROUP_SIZE: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    # One program handles BLOCK_K consecutive tokens of batch entry `b` for kv
+    # head `hk`. For every query head hq of that kv head's group it multiplies
+    # the value rows with WO[hq] tile by tile along the hidden dimension, sums
+    # the absolute values of the products, and finally stores the sum averaged
+    # over the QUERY_GROUP_SIZE query heads in OUT[token, hk].
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+    ks = tl.program_id(2)
+
+    # token range [k_beg, k_end) of this batch entry in the packed layout
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+
+    nk_off = ks * BLOCK_K + tl.arange(0, BLOCK_K)
+    nk = k_beg + nk_off
+    k_mask = nk < k_end
+
+    out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+    l1_sum = tl.zeros([BLOCK_K], dtype=tl.float32)
+
+    offs_d = tl.arange(0, D)
+
+    # The V tile depends only on (token, kv head), not on the query head, so it
+    # is loaded once here instead of once per query-group iteration.
+    v_ptrs = (
+        V
+        + nk[:, None] * STRIDE_V_NK
+        + hk * STRIDE_V_HK
+        + offs_d[None, :] * STRIDE_V_D
+    )
+    v_blk = tl.load(v_ptrs, mask=k_mask[:, None], other=0.0).to(tl.float32)
+
+    for g in range(QUERY_GROUP_SIZE):
+        # query head served by kv head `hk` at position g within its group
+        hq = hk * QUERY_GROUP_SIZE + g
+
+        for hid_off in range(0, HIDDEN, BLOCK_D):
+            hid_idx = hid_off + tl.arange(0, BLOCK_D)
+            hid_mask = hid_idx < HIDDEN
+
+            wo_ptrs = (
+                WO
+                + hq * STRIDE_WO_HQ
+                + offs_d[:, None] * STRIDE_WO_D
+                + hid_idx[None, :] * STRIDE_WO_HID
+            )
+            # columns beyond HIDDEN load as 0 and so add nothing to the sum
+            wo_tile = tl.load(wo_ptrs, mask=hid_mask[None, :], other=0.0).to(tl.float32)
+
+            # [BLOCK_K, D] @ [D, BLOCK_D] -> [BLOCK_K, BLOCK_D]
+            wov_tile = tl.dot(v_blk, wo_tile)
+            l1_sum += tl.sum(tl.abs(wov_tile), axis=1)
+
+    l1_sum = l1_sum / QUERY_GROUP_SIZE
+    tl.store(out_ptrs, l1_sum, mask=k_mask)
+
+
+# ============================================================================
+# Triton Kernel 2: Stage-1 protection + Stage-2 weighted fusion
+# ============================================================================
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_K": bk}) for bk in [32, 64, 128, 256]],
+    key=["Hk"],
+    cache_results=True,
+)
+@triton.jit
+def _critical_ada_fuse_kernel(
+    BASE_SCORES,
+    WO_V_NORM,
+    STAGE1_MASK,
+    cu_k,
+    OUT,
+    EPSILON: tl.constexpr,
+    STRIDE_BS_NK,
+    STRIDE_BS_HK,
+    STRIDE_WN_NK,
+    STRIDE_WN_HK,
+    STRIDE_S1_NK,
+    STRIDE_S1_HK,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    Hk: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    # One program handles one (batch entry, kv head) pair and walks that
+    # entry's tokens in BLOCK_K-sized chunks. For each token it writes
+    # (base + EPSILON) * wnorm to OUT, or +inf where STAGE1_MASK equals 1.
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+
+    # token range [k_beg, k_end) of this batch entry in the packed layout
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+
+    for ks in tl.range(k_beg, k_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_end
+
+        bs_ptrs = BASE_SCORES + nk * STRIDE_BS_NK + hk * STRIDE_BS_HK
+        wn_ptrs = WO_V_NORM + nk * STRIDE_WN_NK + hk * STRIDE_WN_HK
+        s1_ptrs = STAGE1_MASK + nk * STRIDE_S1_NK + hk * STRIDE_S1_HK
+
+        base = tl.load(bs_ptrs, mask=kmask, other=0.0)
+        wnorm = tl.load(wn_ptrs, mask=kmask, other=1.0)
+        stage1_protect = tl.load(s1_ptrs, mask=kmask, other=0).to(tl.int32)
+
+        # Stage 2: weight the (epsilon-shifted) base score by the Wo@V norm
+        fused = (base + EPSILON) * wnorm
+        # Stage 1: entries flagged in the mask are forced to +inf
+        fused = tl.where(stage1_protect == 1, float("inf"), fused)
+
+        out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+        tl.store(out_ptrs, fused, mask=kmask)
+
+
+def critical_ada_key_scores(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    wo_weight: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    base_scores: torch.Tensor,
+    compression_ctx: Any,
+    *,
+    store_stream: Optional[torch.cuda.Stream] = None,
+) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+    """
+    使用与引擎一致的保留预算 ``batch_tokens_to_retain``（每条序列的 (token, head) 对数），
+    在每条序列上尽量贴近 kvpress 的 CriticalAdaKV 语义：
+      1) alpha_safeguard 安全预算（每头至少保留一部分）；
+      2) 基于 base_scores 的 head-wise 自适应预算分配（head_budgets）；
+      3) Stage-1 按 head_budgets * first_stage_ratio 保护；
+      4) Stage-2 计算 ``(base + eps) * ||Wo@V||_1``，再按 head_budgets 做每头 top-k 保护。
+
+    Args:
+        compression_ctx: 与 ``CompressionContext`` 相同字段即可（duck typing），须含
+            ``batch_tokens_to_retain``、``protected_first_tokens``、``protected_last_tokens``；
+            可选 ``critical_ada_epsilon``、``critical_ada_first_stage_ratio``、
+            ``critical_ada_alpha_safeguard``。
+    """
+    assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
+    device = q.device
+    _, Hq, D = q.shape
+    N_k, Hk, Dk = k.shape
+    assert D == Dk and Hq % Hk == 0
+
+    # 与 non_causal_attn_scores 使用同一 cu（prefill 下即 context.cu_seqlens_q），
+    # 保证 base_scores 行与 Triton 分段一致；勿与 cu_seqlens_k 混用。
+    B = cu_seqlens.numel() - 1
+    G = Hq // Hk
+    k_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+
+    btr = compression_ctx.batch_tokens_to_retain
+    assert btr is not None and btr.numel() == B
+    btr = btr.to(device=device, dtype=torch.int32)
+
+    prot_first = compression_ctx.protected_first_tokens or [0] * B
+    prot_last = compression_ctx.protected_last_tokens or [0] * B
+    epsilon = compression_ctx.critical_ada_epsilon
+    first_stage_ratio = compression_ctx.critical_ada_first_stage_ratio
+    alpha_safeguard = float(getattr(compression_ctx, "critical_ada_alpha_safeguard", 0.2))
+    alpha_safeguard = max(0.0, min(1.0, alpha_safeguard))
+
+    if wo_weight.dim() == 2:
+        hidden_size, _ = wo_weight.shape
+        wo = wo_weight.transpose(0, 1).view(Hq, D, hidden_size).contiguous()
+    else:
+        wo = wo_weight.contiguous()
+    hidden_size = wo.size(-1)
+
+    wo_v_norm = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+
+    def grid_wo(META):
+        max_k_len = int(k_lengths.max().item())
+        return (B, Hk, triton.cdiv(max_k_len, META["BLOCK_K"]))
+
+    _compute_wo_v_l1_kernel[grid_wo](
+        v,
+        wo,
+        cu_seqlens,
+        wo_v_norm,
+        *v.stride(),
+        *wo.stride(),
+        *wo_v_norm.stride(),
+        Hk=Hk,
+        Hq=Hq,
+        D=D,
+        HIDDEN=hidden_size,
+        QUERY_GROUP_SIZE=G,
+    )
+
+    stage1_mask = torch.zeros((N_k, Hk), dtype=torch.int32, device=device)
+    # kvpress 风格的每头预算（按序列自适应），用于 Stage-1/Stage-2。
+    head_budgets_by_batch = []
+
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            head_budgets_by_batch.append(None)
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        s = int(prot_first[b]) if b < len(prot_first) else 0
+        e = int(prot_last[b]) if b < len(prot_last) else 0
+        lo, hi = k_beg + s, k_end - e
+        compressible = max(0, hi - lo)
+        keep_pairs = int(btr[b].item())
+        if compressible <= 0:
+            head_budgets_by_batch.append(None)
+            continue
+        # 每头 token 预算（kvpress 的 n_kept）
+        n_kept_tokens = max(1, keep_pairs // Hk)
+        n_kept_tokens = min(n_kept_tokens, compressible)
+        # 安全预算（每头至少保留 n_safe）
+        n_safe = int(n_kept_tokens * alpha_safeguard)
+        if n_safe > 0:
+            tk_safe = min(n_safe, compressible)
+            for hk in range(Hk):
+                safe_idx = torch.topk(base_scores[lo:hi, hk], tk_safe, sorted=False).indices
+                stage1_mask[lo + safe_idx, hk] = 1
+
+        # 自适应预算分配：在扁平 (token, head) 空间取 top n_kept_tokens*Hk，统计每个 head 的预算
+        budget_scores = base_scores[lo:hi, :].clone()
+        if n_safe > 0:
+            budget_scores[stage1_mask[lo:hi, :] == 1] = float("inf")
+        top_pairs = min(n_kept_tokens * Hk, budget_scores.numel())
+        if top_pairs <= 0:
+            head_budgets_by_batch.append(None)
+            continue
+        top_idx_flat = torch.topk(
+            budget_scores.reshape(-1), top_pairs, sorted=False
+        ).indices
+        top_head_idx = top_idx_flat % Hk
+        head_budgets = torch.bincount(top_head_idx, minlength=Hk).to(torch.int32)
+        head_budgets_by_batch.append(head_budgets)
+
+        # Stage-1：按 head_budgets 的 first_stage_ratio 分头保护（kvpress 语义）
+        for hk in range(Hk):
+            phase1_budget = int(head_budgets[hk].item() * first_stage_ratio)
+            if phase1_budget <= 0:
+                continue
+            tk = min(phase1_budget, compressible)
+            top_idx = torch.topk(base_scores[lo:hi, hk], tk, sorted=False).indices
+            stage1_mask[lo + top_idx, hk] = 1
+
+    final_scores = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+
+    def grid_fuse(_META):
+        return (B, Hk)
+
+    _critical_ada_fuse_kernel[grid_fuse](
+        base_scores,
+        wo_v_norm,
+        stage1_mask,
+        cu_seqlens,
+        final_scores,
+        EPSILON=epsilon,
+        *base_scores.stride(),
+        *wo_v_norm.stride(),
+        *stage1_mask.stride(),
+        *final_scores.stride(),
+        Hk=Hk,
+    )
+
+    # Stage-2（kvpress 语义）：在融合后按每头预算再做一次 top-k 保护。
+    for b in range(B):
+        hb = head_budgets_by_batch[b]
+        if hb is None:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        s = int(prot_first[b]) if b < len(prot_first) else 0
+        e = int(prot_last[b]) if b < len(prot_last) else 0
+        lo, hi = k_beg + s, k_end - e
+        if hi <= lo:
+            continue
+        region_len = hi - lo
+        for hk in range(Hk):
+            budget = int(hb[hk].item())
+            if budget <= 0:
+                continue
+            tk = min(budget, region_len)
+            idx = torch.topk(final_scores[lo:hi, hk], tk, sorted=False).indices
+            final_scores[lo + idx, hk] = float("inf")
+
+    masked_key_indices = None
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            continue
+        keep_pairs = int(btr[b].item())
+        total_pairs = k_len * Hk
+        if keep_pairs >= total_pairs:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        n_prune_pairs = min(total_pairs - keep_pairs, total_pairs)
+        if n_prune_pairs <= 0:
+            continue
+
+        flat_scores = final_scores[k_beg:k_end, :].reshape(-1)
+        prune_idx = torch.topk(
+            -flat_scores, min(n_prune_pairs, flat_scores.numel()), sorted=False
+        ).indices
+        batch_idx = torch.full_like(prune_idx, b, dtype=torch.int64)
+        head_idx = prune_idx % Hk
+        seq_idx = prune_idx // Hk + k_beg
+        if masked_key_indices is None:
+            masked_key_indices = (batch_idx, head_idx, seq_idx)
+        else:
+            masked_key_indices = (
+                torch.cat([masked_key_indices[0], batch_idx]),
+                torch.cat([masked_key_indices[1], head_idx]),
+                torch.cat([masked_key_indices[2], seq_idx]),
+            )
+
+    if store_stream is not None:
+        final_scores.record_stream(store_stream)
+
+    return final_scores, masked_key_indices
+
+
+class CriticalAdaKVCompression(BaseCompressionMethod):
+    """
+    以 CompactorCompression 为基分（pre RoPE 杠杆 + post RoPE 非因果融合），
+    再应用 CriticalAda 两阶段加权；须由 Attention 在 post-RoPE 前注入 ``compression_context.wo_weight``。
+    """
+
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        cc = context.compression_context
+        base = getattr(cc, "critical_ada_base_scorer", "compactor") if cc is not None else "compactor"
+        if str(base).lower() == "snapkv":
+            return SnapKVCompression.pre_rope_scoring(q, k, v, context)
+        return CompactorCompression.pre_rope_scoring(q, k, v, context)
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: Optional[torch.Tensor],
+        context,
+    ) -> Optional[torch.Tensor]:
+        compression_context = context.compression_context
+        assert compression_context is not None
+        base = str(getattr(compression_context, "critical_ada_base_scorer", "compactor")).lower()
+
+        if base == "snapkv":
+            base_scores = SnapKVCompression.post_rope_scoring(q, k, v, pre_rope_scores, context)
+        else:
+            # 与 compactor.py 中 CompactorCompression.post_rope_scoring 逐字一致：
+            # maybe_execute_in_stream(non_causal_attn_scores, q,k,v, cu_seqlens_q, max_seqlen_q, ...)
+            # 不得改为其它封装，否则与单独使用 COMPACTOR 时分数字不一致。
+            if context.STORE_STREAM is not None:
+                torch.cuda.current_stream().wait_stream(context.STORE_STREAM)
+
+            base_scores = maybe_execute_in_stream(
+                non_causal_attn_scores,
+                q,
+                k,
+                v,
+                context.cu_seqlens_q,
+                context.max_seqlen_q,
+                chunk_size=CompactorCompression.chunk_size,
+                sm_scale=1.0,
+                normalize=True,
+                accum_scores=pre_rope_scores,
+                context_lens=compression_context.context_lens,
+                protected_first_tokens=compression_context.protected_first_tokens,
+                protected_last_tokens=compression_context.protected_last_tokens,
+                accum_blending=0.5,
+            )
+
+        wo_weight = compression_context.wo_weight
+        if wo_weight is None:
+            return base_scores
+
+        scores, _masked = maybe_execute_in_stream(
+            critical_ada_key_scores,
+            q,
+            k,
+            v,
+            wo_weight,
+            context.cu_seqlens_q,
+            base_scores,
+            compression_context,
+            STORE_STREAM=context.STORE_STREAM,
+            store_stream=context.STORE_STREAM,
+        )
+        return scores
+
+    @staticmethod
+    def prepare_layer(module: torch.nn.Module, device: torch.device, dtype: torch.dtype):
+        """可选：预计算并缓存 Wo；实际推理以 Attention.forward 中注入的 ``cc.wo_weight`` 为准。"""
+        if not hasattr(module, "o_proj") or module.o_proj.weight is None:
+            return
+        if not hasattr(module, "num_heads") or not hasattr(module, "head_dim"):
+            return
+        wo_raw = module.o_proj.weight.data
+        hidden_size, _ = wo_raw.shape
+        Hq = module.num_heads
+        head_dim = module.head_dim
+        wo = (
+            wo_raw.transpose(0, 1)
+            .view(Hq, head_dim, hidden_size)
+            .to(device=device, dtype=torch.float32)
+        )
+        module._critical_ada_wo_weight = wo
+
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/criticalkv.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/criticalkv.py
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/criticalkv_origin.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/criticalkv_origin.py
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/snapkv.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/snapkv.py
--- a/vllm/compactor-vllm/src/compactor_vllm/compression/snapkv_origin.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/compression/snapkv_origin.py
--- a/vllm/compactor-vllm/src/compactor_vllm/config/__init__.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/config/__init__.py
--- a/vllm/compactor-vllm/src/compactor_vllm/config/constants.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/config/constants.py
+# Batch index treated as reserved (presumably never handed to a real request;
+# TODO confirm against the code that consumes this constant).
+RESERVED_BATCH = 0
+# NOTE: Triton `tl.constexpr` is intended for use in kernel signatures/annotations.
+# Some Triton builds reject passing `tl.constexpr(...)` objects as constexpr values.
+# Keep the runtime value as a plain int and let kernel signatures declare constexpr.
+# Plain-int alias of RESERVED_BATCH for use on the Triton side.
+TRITON_RESERVED_BATCH = RESERVED_BATCH
--- a/vllm/compactor-vllm/src/compactor_vllm/config/engine_config.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/config/engine_config.py
--- a/vllm/compactor-vllm/src/compactor_vllm/config/sampling_params.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/config/sampling_params.py
+from dataclasses import dataclass
+
+
+@dataclass
+class SamplingParams:
+    temperature: float = 1.0
+    max_new_tokens: int = 256
+
+    def __post_init__(self):
+        if self.temperature < 0:
+            raise ValueError("Temperature cannot be negative")
--- a/vllm/compactor-vllm/src/compactor_vllm/core/__init__.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/core/__init__.py
--- a/vllm/compactor-vllm/src/compactor_vllm/core/llm_engine.py
+++ b/vllm/compactor-vllm/src/compactor_vllm/core/llm_engine.py