vllm kvprune:v1.0.0

2b7160c6 · chenzk · fa718036 · 2b7160c6 · 2b7160c6 · 2b7160c6
Commit 2b7160c6 authored Apr 23, 2026 by chenzk
20 changed files
--- a/vllm/kvprune/compression/compactor_origin.py
+++ b/vllm/kvprune/compression/compactor_origin.py
+import logging
+import math
+from typing import List, Optional
+
+import torch
+import triton
+from tqdm.contrib.logging import logging_redirect_tqdm
+from triton import language as tl
+
+from vllm.kvprune.compression.common import BaseCompressionMethod
+from vllm.kvprune.utils.helpers import maybe_execute_in_stream
+from vllm.kvprune.utils.triton_compat import autotune as triton_autotune
+
+logger = logging.getLogger(__name__)
+
+
+class CompactorCompression(BaseCompressionMethod):
+    """Compactor key scoring.
+
+    Keys are scored twice: once before RoPE with sketched leverage scores and
+    once after RoPE with chunked non-causal attention, the latter blended with
+    the former.
+    """
+
+    # Chunk length handed to ``non_causal_attn_scores`` by ``post_rope_scoring``.
+    chunk_size: int = 128
+
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        """Score the pre-RoPE keys with ``approximate_leverage_scores``.
+
+        Only ``k`` is forwarded; ``q`` and ``v`` are accepted but not used here.
+        The sketch matrix, the context lengths and the chunk size are read from
+        ``context.compression_context``; the call is dispatched through
+        ``maybe_execute_in_stream`` together with ``context.STORE_STREAM``.
+
+        :return: whatever ``maybe_execute_in_stream`` returns for the scoring call.
+        """
+        cctx = context.compression_context
+        return maybe_execute_in_stream(
+            approximate_leverage_scores,
+            k,
+            cctx.context_lens,
+            cctx.PHI,
+            normalize=True,
+            chunk_size=cctx.compression_chunk_size,
+            STORE_STREAM=context.STORE_STREAM,
+        )
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: torch.Tensor,
+        context,
+    ) -> Optional[torch.Tensor]:
+        """Score the post-RoPE keys with ``non_causal_attn_scores``.
+
+        The attention scores are computed with ``sm_scale=1.0`` over chunks of
+        ``CompactorCompression.chunk_size`` tokens, normalized, and then
+        ``pre_rope_scores`` is passed as ``accum_scores`` with a blending weight
+        of 0.5. The protected prefix/suffix lengths and the context lengths come
+        from ``context.compression_context``.
+
+        :return: whatever ``maybe_execute_in_stream`` returns for the scoring call.
+        """
+        cctx = context.compression_context
+        blended = maybe_execute_in_stream(
+            non_causal_attn_scores,
+            q,
+            k,
+            v,
+            context.cu_seqlens_q,
+            context.max_seqlen_q,
+            chunk_size=CompactorCompression.chunk_size,
+            sm_scale=1.0,
+            normalize=True,
+            accum_scores=pre_rope_scores,
+            context_lens=cctx.context_lens,
+            protected_first_tokens=cctx.protected_first_tokens,
+            protected_last_tokens=cctx.protected_last_tokens,
+            accum_blending=0.5,
+        )
+        return blended
+
+
+def split_into_chunks(xs, chunk_size):
+    """
+    Convert a list of sequence lengths into a sequence of coalesced chunk lengths.
+
+    Given an iterable of per-sequence context lengths ``xs`` and a target ``chunk_size``,
+    this helper produces two parallel lists:
+
+      * ``coalesced_chunks`` – lengths of contiguous segments in the
+        **concatenated** sequence space, where each segment corresponds either
+        to a run of full chunks (a multiple of ``chunk_size``) or to a residual
+        "epilogue" tail shorter than ``chunk_size``.
+
+      * ``chunks`` – the actual chunk sizes used within each original sequence.
+        For a length ``n``, we produce ``n // chunk_size`` entries of
+        ``chunk_size`` (the "prologue") and at most one final entry equal to
+        ``n % chunk_size`` (the "epilogue").
+
+    ``chunks`` reflects how each input length is decomposed into
+    fixed-size (plus optional tail) processing blocks, while
+    ``coalesced_chunks`` describes those same blocks after merging the
+    consecutive full-size chunks of one sequence together.
+
+    Example:
+        xs = [257, 127], chunk_size = 128
+        coalesced_chunks = [256, 1, 127]
+        chunks           = [128, 128, 1, 127]
+
+    Args:
+        :param xs:
+            Iterable of non-negative integers. Zero-length entries contribute
+            nothing to either list.
+        :param chunk_size:
+            Target chunk size; must be a positive integer.
+
+    Returns:
+        :return Tuple[List[int], List[int]]:
+            ``(coalesced_chunks, chunks)`` as described above.
+
+    Raises:
+        :raises ValueError:
+            If ``chunk_size`` is not positive. (Previously ``0`` surfaced as a
+            bare ``ZeroDivisionError`` and negative values silently produced
+            meaningless lengths.)
+    """
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
+
+    coalesced_chunks, chunks = [], []
+    for n in xs:
+        nchunks, epilogue = divmod(n, chunk_size)
+        if nchunks > 0:
+            # All full chunks of one sequence are adjacent, so they coalesce
+            # into a single segment.
+            coalesced_chunks.append(nchunks * chunk_size)
+            chunks.extend([chunk_size] * nchunks)
+        if epilogue > 0:
+            coalesced_chunks.append(epilogue)
+            chunks.append(epilogue)
+    return coalesced_chunks, chunks
+
+
+def approximate_leverage_scores(
+    key_states: torch.Tensor,  # [N, H, D]
+    context_lens: List[int],  # [B]
+    PHI: torch.Tensor,  # [D, k]
+    regularizer: float = 5e-3,
+    normalize: bool = False,
+    chunk_size: int = 512,
+) -> torch.Tensor:  # returns [N, H]
+    """
+    Approximate leverage scores for keys via randomized sketching.
+
+    This implements a randomized approximation to per-token leverage scores for
+    the key matrix, as described in Compactor: Calibrated Query-Agnostic KV Cache
+    Compression with Approximate Leverage Scores (https://arxiv.org/abs/2507.08143).
+
+    Outline of the computation:
+      1. Sketch the keys: ``X = K @ PHI`` per head, shape ``[H, N, k]``.
+      2. Split ``X`` into chunks, mean-center every chunk in place and build
+         its ``k x k`` Gram matrix.
+      3. Add ``regularizer`` to each Gram diagonal and take a batched SVD.
+      4. Score every token as the squared norm of its sketched row projected
+         onto ``V * S^(-1/2)``.
+
+    If the ``gesvda`` SVD raises ``RuntimeError`` it is retried once after
+    adding a further ``10 * regularizer`` to the (already regularized)
+    diagonal; if that also fails, the computation falls back to
+    ``_approximate_leverage_scores_qr_fallback`` and a warning is logged.
+
+    Args:
+        :param key_states:
+            Tensor of shape ``[N, H, D]`` containing pre-RoPE key states for
+            all tokens across the batch, packed along the sequence dimension.
+            ``N = sum(context_lens)``. Not modified: centering is applied to
+            the sketched copy ``X`` only.
+        :param context_lens:
+            List of per-sequence context lengths, length ``B``.
+        :param PHI:
+            Random projection matrix of shape ``[D, k]`` used to sketch the
+            keys into a lower-dimensional subspace (k < D).
+        :param regularizer:
+            Small positive scalar added to the diagonal of each Gram matrix
+            before SVD to improve numerical stability. Defaults to ``5e-3``.
+        :param normalize:
+            If True, z-score normalize the scores in place. The statistics are
+            taken over all heads and tokens of each *chunk* (one kernel program
+            per entry of the chunk-length list), which equals per-sequence
+            normalization only when ``chunk_size <= 0``.
+        :param chunk_size:
+            Target chunk size along the sequence dimension. If > 0, the
+            concatenated sequence is split into chunks of at most this size
+            before forming Gram matrices and SVD. If ≤ 0, the entire sequence
+            for each context is treated as a single chunk.
+    Returns:
+        :return torch.Tensor:
+            Approximate leverage scores of shape ``[N, H]``, where each row
+            corresponds to a token and each column to a head.
+    """
+    if chunk_size > 0:
+        coalesced_chunk_lens, chunks_lens = split_into_chunks(context_lens, chunk_size)
+    else:
+        coalesced_chunk_lens, chunks_lens = context_lens, context_lens
+    # Leading 0 so that a cumulative sum later yields [start_0, ..., end_last].
+    chunk_lens_cuda = torch.tensor([0] + chunks_lens).cuda(non_blocking=True)
+    # Sketch: [H, N, D] @ [D, k] -> [H, N, k]
+    X = torch.matmul(key_states.transpose(0, 1).contiguous(), PHI.contiguous())
+    H, N, k = X.shape
+    # torch.split returns views, so the in-place centering below writes into X.
+    chunks = torch.split(X, coalesced_chunk_lens, dim=-2)
+    gram_matrices = []
+    for i, L in enumerate(coalesced_chunk_lens):
+        chunk = chunks[i]
+        if chunk_size <= 0 or L % chunk_size != 0:
+            # Single (whole-sequence or epilogue) chunk: one Gram matrix per head.
+            chunk.sub_(chunk.mean(dim=-2, keepdim=True))
+            g = torch.matmul(chunk.transpose(-1, -2).contiguous(), chunk.contiguous())
+            g = g.unsqueeze(1)
+        else:
+            # Run of full chunks: reshape so every chunk is centered and
+            # multiplied as its own batch entry.
+            chunk = chunk.view(H, -1, chunk_size, k)  # [H, num_chunks, chunk_size, k]
+            chunk.sub_(chunk.mean(dim=-2, keepdim=True))
+            g = torch.matmul(chunk.transpose(-1, -2).contiguous(), chunk.contiguous())
+        gram_matrices.append(g)
+    # [H, total_chunks, k, k]
+    G = torch.cat(gram_matrices, dim=1).to(torch.float32)
+    # diagonal() is a view, so this regularizes G in place.
+    diag = G.diagonal(dim1=-2, dim2=-1)
+    diag.add_(regularizer)
+    try:
+        V, S, Vt = torch.linalg.svd(G, full_matrices=False, driver="gesvda")
+    except RuntimeError:
+        try:
+            # Retry with a stronger ridge; this stacks on top of the
+            # regularizer already added above.
+            diag = G.diagonal(dim1=-2, dim2=-1)
+            diag.add_(regularizer * 10)
+            V, S, Vt = torch.linalg.svd(G, full_matrices=False, driver="gesvda")
+        except RuntimeError:
+            with logging_redirect_tqdm():
+                logger.warning(
+                    "GESVDA failed, falling back to QR decomposition, which will be MUCH slower. "
+                    "Try increasing chunk_size if this issue persists."
+                )
+            # this is over 50 times slower than using GESVDA
+            return _approximate_leverage_scores_qr_fallback(
+                X=X,
+                chunks_lens=chunks_lens,
+                chunk_lens_cuda=chunk_lens_cuda,
+                normalize=normalize,
+                chunk_size=chunk_size,
+            )
+    # Columns of V scaled by S^(-1/2); projecting a row x onto this and taking
+    # the squared norm evaluates x^T G^{-1} x.
+    SV = (V * S.rsqrt().unsqueeze(-2)).to(X.dtype)
+    start = 0  # index of the first Gram matrix belonging to the current segment
+    all_scores = []
+    for i, L in enumerate(coalesced_chunk_lens):
+        chunk = chunks[i]
+        if chunk_size <= 0 or L % chunk_size != 0:
+            num_chunks = 1
+            sv = SV[:, start]
+        else:
+            num_chunks = L // chunk_size
+            chunk = chunk.view(H, -1, chunk_size, k)  # [H, NC, CS, k]
+            sv = SV[:, start : start + num_chunks]
+        U = torch.matmul(chunk.contiguous(), sv.contiguous())
+        scores = (U * U).sum(dim=-1).clamp_min_(0.0).view(H, -1)
+        all_scores.append(scores.transpose(-1, -2))  # [L, H]
+        start += num_chunks
+
+    scores = torch.cat(all_scores, dim=0)
+    if normalize:
+        # One program per chunk; cu_k holds the cumulative chunk boundaries.
+        grid = (len(chunks_lens),)
+        cu_k = chunk_lens_cuda.cumsum(dim=0)
+        _zscore_per_batch_epilogue_no_window[grid](
+            scores, cu_k, scores.stride(0), scores.stride(1), H
+        )
+    return scores
+
+
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_K": bk}) for bk in [32, 64, 128]],
+    key=["HK"],
+    cache_results=True,
+)
+@triton.jit
+def _zscore_per_batch_epilogue_no_window(
+    OUT,  # [Nk, Hk] scores, normalized in place
+    cu_k,  # [num_segments + 1] cumulative row offsets delimiting the segments
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    HK: tl.constexpr,  # Hk
+    BLOCK_K: tl.constexpr,  # e.g., 128
+):
+    # In-place z-score normalization of OUT, one program per segment.
+    #
+    # Program ``b`` handles rows ``cu_k[b] .. cu_k[b + 1]`` and computes a
+    # single mean / variance over all of those rows and all HK columns, then
+    # rewrites every element as ``(x - mean) / std``. Empty segments are
+    # skipped.
+    b = tl.program_id(0)
+
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    if k_end <= k_beg:
+        return
+
+    sumv = tl.zeros([], dtype=tl.float32)
+    sumsq = tl.zeros([], dtype=tl.float32)
+    count = ((k_end - k_beg) * HK).to(tl.float32)
+
+    # Pass 1: accumulate sum and sum of squares in float32.
+    for ks in tl.range(k_beg, k_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_end
+        for h in tl.range(0, HK):
+            ptrs = OUT + nk * STRIDE_OUT_NK + h * STRIDE_OUT_HK
+            vals = tl.load(ptrs, mask=kmask, other=0.0).to(tl.float32)
+            sumv += tl.sum(vals, 0)
+            sumsq += tl.sum(vals * vals, 0)
+
+    mean = sumv / count
+    # E[x^2] - E[x]^2 can dip below zero through rounding, hence the clamp.
+    var = tl.maximum(sumsq / count - mean * mean, 0.0)
+    # Floor the variance before inverting: a segment whose values are all equal
+    # (e.g. a single token with a single head) has var == 0, and 1 / sqrt(0)
+    # would turn every (x - mean) == 0 into 0 * inf = NaN. With the floor such
+    # segments normalize to 0 instead.
+    invstd = 1.0 / tl.sqrt(tl.maximum(var, 1e-12))
+
+    # Pass 2: rewrite the segment with the normalized values.
+    for ks in tl.range(k_beg, k_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_end
+        for h in tl.range(0, HK):
+            ptrs = OUT + nk * STRIDE_OUT_NK + h * STRIDE_OUT_HK
+            vals = tl.load(ptrs, mask=kmask, other=0.0).to(tl.float32)
+            vals = (vals - mean) * invstd
+            tl.store(ptrs, vals, mask=kmask)
+
+
+def _approximate_leverage_scores_qr_fallback(
+    X: torch.Tensor,  # [H, N, k], already sketched (KΦ) and centered in-place
+    chunks_lens: List[int],  # [num_chunks]
+    chunk_lens_cuda: torch.Tensor,  # [num_chunks + 1] (prefix base)
+    normalize: bool,
+    chunk_size: int,
+) -> torch.Tensor:
+    """
+    Leverage scores from a reduced QR factorization of every chunk.
+
+    For each chunk of the sketched keys the score of a token is the squared
+    row norm of the orthonormal factor ``Q``. Chunks whose length equals
+    ``chunk_size`` are stacked and factorized in one batched call; every other
+    chunk (or all chunks when ``chunk_size <= 0``) is factorized on its own.
+
+    :param X: sketched keys of shape ``[H, N, k]``
+    :param chunks_lens: chunk lengths along ``N``; must sum to ``N``
+    :param chunk_lens_cuda: chunk lengths with a leading 0, cumulatively summed
+        to obtain segment boundaries when ``normalize`` is set
+    :param normalize: z-score normalize each chunk in place before returning
+    :param chunk_size: nominal chunk length (``<= 0`` disables batching)
+    :return: scores of shape ``[N, H]`` in the dtype of ``X``
+    :raises RuntimeError: if ``chunks_lens`` does not sum to ``N``
+    """
+    H, N, _ = X.shape
+    out_dtype = X.dtype
+
+    # Start row of every chunk in the packed sequence dimension.
+    offsets: List[int] = []
+    offset = 0
+    for length in chunks_lens:
+        offsets.append(offset)
+        offset += length
+    if offset != N:
+        raise RuntimeError(
+            f"QR fallback: sum(chunks_lens)={offset} does not match N={N}"
+        )
+
+    blocks = torch.split(X, chunks_lens, dim=-2)
+    scores = torch.empty(N, H, device=X.device, dtype=out_dtype)
+
+    if chunk_size <= 0:
+        # No nominal chunk size: every chunk goes through the one-by-one path.
+        epi_indices = list(range(len(chunks_lens)))
+    else:
+        full_indices, epi_indices = [], []
+        for idx, length in enumerate(chunks_lens):
+            if length == chunk_size:
+                full_indices.append(idx)
+            else:
+                epi_indices.append(idx)
+
+        if full_indices:
+            stacked = torch.stack(
+                [blocks[idx] for idx in full_indices], dim=0
+            )  # [M, H, CS, k]
+            M, Hf, Lf, kf = stacked.shape
+            assert Lf == chunk_size
+
+            # Fold (M, H) into one batch dimension for torch.linalg.qr.
+            flat = stacked.view(M * Hf, Lf, kf).to(torch.float32)
+            q_full, _ = torch.linalg.qr(flat, mode="reduced")
+            q_full = q_full.to(out_dtype)
+            row_norms = (q_full * q_full).sum(dim=-1).clamp_min(0.0)  # [M * H, CS]
+            per_chunk = row_norms.view(M, Hf, Lf).transpose(-1, -2)  # [M, CS, H]
+            for m, chunk_idx in enumerate(full_indices):
+                begin = offsets[chunk_idx]
+                end = begin + chunks_lens[chunk_idx]
+                scores[begin:end].copy_(per_chunk[m])
+
+    for chunk_idx in epi_indices:
+        block = blocks[chunk_idx]
+        Lc = block.shape[1]
+        if Lc == 0:
+            continue
+        q_epi, _ = torch.linalg.qr(block.to(torch.float32), mode="reduced")
+        row_norms = (q_epi * q_epi).sum(dim=-1).to(out_dtype)  # [H, Lc]
+        begin = offsets[chunk_idx]
+        scores[begin : begin + Lc] = row_norms.transpose(0, 1)  # [Lc, H]
+
+    if normalize:
+        cu_k = chunk_lens_cuda.cumsum(dim=0)
+        launch_grid = (len(chunks_lens),)
+        _zscore_per_batch_epilogue_no_window[launch_grid](
+            scores, cu_k, scores.stride(0), scores.stride(1), H
+        )
+    return scores
+
+
+@triton_autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_M": BM, "BLOCK_K": BK, "WARPSPEC": False}, num_warps=w, num_stages=s
+        )
+        for BM in [64]
+        for BK in [64]
+        for w in [4]
+        for s in [2]
+    ],
+    key=[
+        "QUERY_GROUP_SIZE",
+        "D",
+        "CHUNK_SIZE",
+    ],
+    cache_results=True,
+)
+@triton.jit
+def _non_causal_attn_kernel(
+    Q,
+    K,
+    V,
+    accum_scores,
+    cu_seqlens_qk,
+    # strides
+    STRIDE_Q_G,
+    STRIDE_Q_N,
+    STRIDE_Q_H,
+    STRIDE_Q_D,
+    STRIDE_K_G,
+    STRIDE_K_N,
+    STRIDE_K_D,
+    STRIDE_V_G,
+    STRIDE_V_N,
+    STRIDE_V_D,
+    STRIDE_OUT_N,
+    STRIDE_OUT_H,
+    sm_scale,
+    # compile-time parameters
+    CHUNK_SIZE: tl.constexpr,
+    QUERY_GROUP_SIZE: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    D: tl.constexpr,
+    WARPSPEC: tl.constexpr,
+):
+    # Chunked, non-causal attention "column mass" per key.
+    #
+    # One program handles one (KV head, sequence, chunk) triple. Inside the
+    # chunk every query attends to every key of the same chunk (no causal
+    # mask); the softmax probabilities are summed over all queries and over all
+    # query heads of the group, and the per-key total is added into
+    # ``accum_scores[key, kv_head]``.
+    #
+    # The softmax is evaluated in two passes over the keys: pass 1 finds the
+    # running row maximum and denominator, pass 2 recomputes the logits and
+    # accumulates the normalized probabilities.
+    #
+    # ``V``, the ``STRIDE_V_*`` arguments and ``WARPSPEC`` are not read anywhere
+    # in this body.
+    TOTAL_QUERIES_PER_BLOCK: tl.constexpr = BLOCK_M * QUERY_GROUP_SIZE
+    INVERSE_CHUNK: tl.constexpr = 1.0 / CHUNK_SIZE
+    pid_g = tl.program_id(0)  # KV head in [0, HKV)
+    pid_b = tl.program_id(1)  # batch id
+    pid_m = tl.program_id(2)  # chunk id within batch
+
+    # Row range of this sequence in the packed token dimension.
+    off_b = tl.load(cu_seqlens_qk + pid_b)
+    off_b1 = tl.load(cu_seqlens_qk + pid_b + 1)
+
+    chunk_start = off_b + pid_m * CHUNK_SIZE
+    chunk_end = tl.minimum(chunk_start + CHUNK_SIZE, off_b1)
+    M = chunk_end - chunk_start
+    # The grid is sized for the longest sequence, so shorter sequences get
+    # programs whose chunk lies past their end.
+    if M <= 0:
+        return
+
+    offs_d = tl.arange(0, D)
+    offs_k = tl.arange(0, BLOCK_K)
+
+    # Flattened query rows inside a [BLOCK_M, QUERY_GROUP_SIZE] tile
+    offs_q = tl.arange(0, TOTAL_QUERIES_PER_BLOCK)
+    row_m = offs_q % BLOCK_M  # token offset in this tile
+    row_h = offs_q // BLOCK_M  # query-group index
+
+    qk_scale = sm_scale * 1.44269504  # convert to log2-domain
+    NEG_INF = -1.0e9  # large finite negative used as the masked logit
+
+    # Iterate over query tiles within this chunk
+    for qs in tl.range(chunk_start, chunk_end, BLOCK_M):
+        # Global query indices for rows in this tile
+        q_idx = qs + row_m  # [TOTAL_QUERIES_PER_BLOCK]
+        q_mask = q_idx < chunk_end  # mask for valid rows in this tile
+
+        # Load Q tile: [TOTAL_QUERIES_PER_BLOCK, D]
+        q_ptrs = (
+            Q
+            + pid_g * STRIDE_Q_G
+            + q_idx[:, None] * STRIDE_Q_N
+            + row_h[:, None] * STRIDE_Q_H
+            + offs_d[None, :] * STRIDE_Q_D
+        )
+        q = tl.load(q_ptrs, mask=q_mask[:, None], other=0.0)
+
+        # ---- Pass 1: per-row max and denominator over all keys in this chunk ----
+        row_max = tl.full([TOTAL_QUERIES_PER_BLOCK], NEG_INF, tl.float32)
+        row_sum = tl.zeros([TOTAL_QUERIES_PER_BLOCK], dtype=tl.float32)
+
+        for ks in tl.range(chunk_start, chunk_end, BLOCK_K):
+            k_idx = ks + offs_k  # [BLOCK_K]
+            k_mask = k_idx < chunk_end  # which keys are valid in this tile
+
+            k_ptrs = (
+                K
+                + pid_g * STRIDE_K_G
+                + k_idx[:, None] * STRIDE_K_N
+                + offs_d[None, :] * STRIDE_K_D
+            )
+            k = tl.load(k_ptrs, mask=k_mask[:, None], other=0.0)  # [BLOCK_K, D]
+
+            # logits: [TOTAL_QUERIES_PER_BLOCK, BLOCK_K]
+            qk = tl.dot(q, k.T) * qk_scale
+            qk = tl.where(q_mask[:, None] & k_mask[None, :], qk, NEG_INF)
+
+            cur_max = tl.max(qk, 1)
+            new_max = tl.maximum(row_max, cur_max)
+
+            # rescale previous sum to new_max (base 2)
+            rescale = tl.math.exp2(row_max - new_max)
+            p = tl.math.exp2(qk - new_max[:, None])
+
+            row_sum = row_sum * rescale + tl.sum(p, 1)
+            row_max = new_max
+
+        # Padded query rows get a denominator of 1; their probabilities are
+        # overwritten below anyway.
+        denom = tl.where(q_mask, row_sum, 1.0)
+
+        # ---- Pass 2: normalized probabilities, summed over queries per key ----
+        for ks in tl.range(chunk_start, chunk_end, BLOCK_K):
+            k_idx = ks + offs_k
+            k_mask = k_idx < chunk_end
+
+            k_ptrs = (
+                K
+                + pid_g * STRIDE_K_G
+                + k_idx[:, None] * STRIDE_K_N
+                + offs_d[None, :] * STRIDE_K_D
+            )
+            k = tl.load(k_ptrs, mask=k_mask[:, None], other=0.0)
+
+            qk = tl.dot(q, k.T) * qk_scale
+            qk = tl.where(q_mask[:, None] & k_mask[None, :], qk, NEG_INF)
+
+            # p has shape [TOTAL_QUERIES_PER_BLOCK, BLOCK_K]
+            p = tl.math.exp2(qk - row_max[:, None]) / denom[:, None]
+            # Padded query rows are NOT zeroed: each contributes a uniform
+            # 1 / CHUNK_SIZE to every key column of the tile. Padded key columns
+            # are dropped by the masked store below.
+            p = tl.where(
+                q_mask[:, None], p, INVERSE_CHUNK
+            )  # preserve attention mass in shorter chunks
+
+            contrib = tl.sum(p, 0)  # [BLOCK_K], sum over queries & query-groups
+
+            # Plain read-modify-write: each program owns the keys of its own
+            # chunk and its own KV-head column, and query tiles run
+            # sequentially inside the program.
+            out_ptrs = accum_scores + k_idx * STRIDE_OUT_N + pid_g * STRIDE_OUT_H
+            old = tl.load(out_ptrs, mask=k_mask, other=0.0)
+            new = old + contrib.to(old.dtype)
+            tl.store(out_ptrs, new, mask=k_mask)
+
+
+def non_causal_attn_scores(
+    q: torch.Tensor,  # [N, HQ, D]
+    k: torch.Tensor,  # [N, HKV, D]
+    v: torch.Tensor,  # [N, HKV, D]
+    cu_seqlens_qk: torch.Tensor,  # [B + 1]
+    max_seqlen_qk: int,
+    chunk_size: int,
+    sm_scale: float = None,
+    normalize: bool = True,
+    context_lens: Optional[List[int]] = None,
+    protected_first_tokens: Optional[List[int]] = None,
+    protected_last_tokens: Optional[List[int]] = None,
+    *,
+    accum_scores: torch.Tensor = None,  # [N, HKV] (float32)
+    accum_blending: float = None,
+) -> torch.Tensor:
+    """
+    Score every key by the attention mass it receives inside its chunk.
+
+    Each sequence is cut into chunks of ``chunk_size`` tokens; within a chunk
+    all queries attend to all keys (no causal mask) and the softmax
+    probabilities are summed per key over the queries and over the query heads
+    sharing a KV head. The steps are applied in this order:
+
+      1. chunked non-causal attention scores (float32, ``[N, HKV]``);
+      2. optional per-sequence z-score normalization;
+      3. optional addition of ``accum_blending * accum_scores``;
+      4. optional protection: the first / last tokens of every sequence are
+         set to ``+inf`` for all heads.
+
+    :param q: Tensor of shape ``[N, HQ, D]`` containing post-rope queries
+    :param k: Tensor of shape ``[N, HKV, D]`` containing post-rope keys
+    :param v: Tensor of shape ``[N, HKV, D]`` containing values (forwarded to the
+        kernel, which does not read it)
+    :param cu_seqlens_qk: Tensor of shape ``[B + 1]`` demarcating batch boundaries
+    :param max_seqlen_qk: int containing the maximum sequence length
+    :param chunk_size: int specifying the size of the chunk to perform non-causal attention over
+    :param sm_scale: float specifying the scaling factor applied to attention scores (1/sqrt(D) if None)
+    :param normalize: bool specifying whether to z-score normalize final attention scores
+    :param context_lens: List[int] specifying the context lengths (CPU copy of
+        ``cu_seqlens_qk.diff()``). Required when either protected-token list is given.
+    :param protected_first_tokens: List[int] specifying how many tokens should be protected at the
+            start of each sequence. ``None`` means no prefix protection.
+    :param protected_last_tokens: List[int] specifying how many tokens should be protected at the
+            end of each sequence. ``None`` means no suffix protection.
+    :param accum_scores: Tensor of shape ``[N, HKV]`` with scores to add to the result; it is
+        read only and not modified.
+    :param accum_blending: float specifying the scaling of ``accum_scores`` prior to adding it to
+        the new non-causal attention scores (no scaling if None). The result is
+        ``out + accum_blending * accum_scores``.
+    :return: float32 tensor of shape ``[N, HKV]``
+    :raises ValueError: if protected tokens are requested without ``context_lens``
+    """
+    assert q.ndim == 3 and k.ndim == 3
+    assert q.shape[0] == k.shape[0] and q.shape[-1] == k.shape[-1]
+    N, HQ, D = q.shape
+    HKV = k.shape[1]
+    assert HQ % HKV == 0, "Number of KV heads must divide number of query heads"
+    assert (D & (D - 1)) == 0, "D must be a power of two"
+
+    B = cu_seqlens_qk.numel() - 1
+    H_g = HQ // HKV  # query-group size per KV head
+
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(D)
+    out = torch.zeros(N, HKV, device=q.device, dtype=torch.float32)
+    # Put the KV-head axis first so that one kernel program addresses one head.
+    q = q.view(N, HKV, H_g, D).permute(1, 0, 2, 3)
+    k = k.view(N, HKV, D).permute(1, 0, 2)
+    # v is intentionally left in its original layout: the kernel never reads it.
+
+    if cu_seqlens_qk.device != q.device:
+        cu_seqlens_qk = cu_seqlens_qk.to(device=q.device)
+    cu_seqlens_qk = cu_seqlens_qk.to(torch.int32)
+
+    STRIDE_Q_G, STRIDE_Q_N, STRIDE_Q_H, STRIDE_Q_D = q.stride()
+    STRIDE_K_G, STRIDE_K_N, STRIDE_K_D = k.stride()
+    STRIDE_V_G, STRIDE_V_N, STRIDE_V_D = v.stride()
+    STRIDE_OUT_N, STRIDE_OUT_H = out.stride()
+
+    assert STRIDE_Q_D == 1 and STRIDE_K_D == 1, "last dim must be contiguous"
+
+    def grid(_):
+        # (KV head, sequence, chunk); sized for the longest sequence, shorter
+        # ones exit early inside the kernel.
+        return (
+            HKV,
+            B,
+            triton.cdiv(max_seqlen_qk, chunk_size),
+        )
+
+    _non_causal_attn_kernel[grid](
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_qk,
+        STRIDE_Q_G,
+        STRIDE_Q_N,
+        STRIDE_Q_H,
+        STRIDE_Q_D,
+        STRIDE_K_G,
+        STRIDE_K_N,
+        STRIDE_K_D,
+        STRIDE_V_G,
+        STRIDE_V_N,
+        STRIDE_V_D,
+        STRIDE_OUT_N,
+        STRIDE_OUT_H,
+        sm_scale,
+        CHUNK_SIZE=chunk_size,
+        QUERY_GROUP_SIZE=H_g,
+        D=D,
+    )
+    if normalize:
+        zscore_grid = (B,)
+        _zscore_per_batch_epilogue_no_window[zscore_grid](
+            out, cu_seqlens_qk, out.stride(0), out.stride(1), HKV
+        )
+    if accum_scores is not None:
+        if accum_blending is not None:
+            out += accum_scores * accum_blending
+        else:
+            out += accum_scores
+    if protected_first_tokens is not None or protected_last_tokens is not None:
+        if context_lens is None:
+            raise ValueError(
+                "context_lens is required when protected_first_tokens or "
+                "protected_last_tokens is given"
+            )
+        num_seqs = len(context_lens)
+        # Either list may be omitted on its own; treat the missing one as zeros.
+        firsts = (
+            protected_first_tokens
+            if protected_first_tokens is not None
+            else [0] * num_seqs
+        )
+        lasts = (
+            protected_last_tokens
+            if protected_last_tokens is not None
+            else [0] * num_seqs
+        )
+        start = 0
+        for first, last, L in zip(firsts, lasts, context_lens):
+            # Clamp to the sequence length: an unclamped prefix would spill
+            # into the next sequence, and an unclamped suffix would reach into
+            # the previous one (or wrap around through a negative index).
+            first = max(0, min(first, L))
+            last = max(0, min(last, L))
+            out[start : start + first].fill_(torch.inf)
+            out[start + L - last : start + L].fill_(torch.inf)
+            start += L
+    return out
--- a/vllm/kvprune/compression/compression_config.py
+++ b/vllm/kvprune/compression/compression_config.py
+import logging
+from dataclasses import dataclass
+from enum import Enum, auto
+
+logger = logging.getLogger(__name__)
+
+
+class CompressionMethod(Enum):
+    """Identifiers of the KV-cache compression methods that can be selected."""
+
+    # Values are assigned by ``auto()`` in declaration order, so reordering or
+    # inserting members changes the numeric value of the ones that follow.
+    CRITICALADAKV = auto()
+    COMPACTOR = auto()
+    SNAPKV = auto()
+    # NOTE(review): presumably means "no compression" — confirm against callers.
+    NONE = auto()
+
+
+# class CachingPolicy(Enum):
+#     CACHE_PROMPT = auto()
+#     DONT_CACHE = auto()
+
+
+# class CompressionType(Enum):
+#     QUERY_AWARE = auto()
+#     QUERY_AGNOSTIC = auto()
+
+
+@dataclass
+class SequenceCompressionParams:
+    """Compression settings attached to a single sequence."""
+
+    # NOTE(review): whether this is the kept or the dropped fraction is not
+    # visible in this file — confirm against the consumer.
+    compression_ratio: float = 1.0
+    # Number of leading tokens to protect — presumably exempt from pruning;
+    # confirm against the scoring code that consumes it.
+    protected_first_tokens: int = 16
+    # Number of trailing tokens to protect — same caveat as above.
+    protected_last_tokens: int = 64
+
+
+@dataclass
+class BatchCompressionParams:
+    """Compression settings shared by a whole batch.
+
+    ``SNAPKV`` cannot be combined with chunked compression, so requesting both
+    turns ``do_chunked_compression`` off in ``__post_init__`` and logs a
+    warning.
+    """
+
+    # compression_type: CompressionType = CompressionType.QUERY_AGNOSTIC
+    compression_method: CompressionMethod = CompressionMethod.COMPACTOR
+
+    # Whether compression is performed chunk by chunk.
+    do_chunked_compression: bool = True
+    # Chunk length used when chunked compression is enabled.
+    chunk_size: int = 512
+
+    def __post_init__(self):
+        # Only warn when the setting is actually being changed; a caller who
+        # already passed do_chunked_compression=False gets no spurious
+        # "Disabling it." message.
+        if (
+            self.compression_method == CompressionMethod.SNAPKV
+            and self.do_chunked_compression
+        ):
+            self.do_chunked_compression = False
+            logger.warning(
+                "CompressionMethod.SNAPKV is not compatible with chunked compression. Disabling it."
+            )
--- a/vllm/kvprune/compression/criticalkv-cursor.py
+++ b/vllm/kvprune/compression/criticalkv-cursor.py
+"""
+CriticalAdaKV: 在 Compactor（pre RoPE 杠杆分 + post RoPE 非因果注意力融合）基础上，
+用输出投影 Wo 对 Value 的 L1 范数做 Stage-2 重加权；Stage-1 在 Compactor 基础分上做预算内 top-k 保护。
+
+预算与 vllm.kvprune 引擎一致：使用 ``compression_context.batch_tokens_to_retain``（flatten 的
+(token, head) 对数量）及首/尾保护段长度。
+
+注意：不得在 import 时加载 ``vllm.kvprune.utils.context``（其会再 import ``CompressionMethod``，
+与 ``compression/__init__.py`` 导入本模块形成环）。运行时只使用与 ``CompressionContext`` 同字段的 duck 对象。
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional, Tuple
+
+import torch
+import triton
+from triton import language as tl
+
+from vllm.kvprune.compression.common import BaseCompressionMethod
+from vllm.kvprune.compression.compactor import (
+    CompactorCompression,
+    non_causal_attn_scores,
+)
+from vllm.kvprune.compression.snapkv import SnapKVCompression
+from vllm.kvprune.utils.helpers import maybe_execute_in_stream
+from vllm.kvprune.utils.triton_compat import autotune as triton_autotune
+
+
+
+# ============================================================================
+# Triton Kernel 1: compute ||Wo @ V||_1 (L1 norm)
+# ============================================================================
+@triton_autotune(
+    configs=[
+        triton.Config({"BLOCK_K": bk, "BLOCK_D": bd}, num_warps=nw, num_stages=ns)
+        for bk in [32, 64, 128]
+        for bd in [32, 64]
+        for nw in [4, 8]
+        for ns in [3, 4]
+    ],
+    key=["Hk", "D", "HIDDEN"],
+    cache_results=True,
+)
+@triton.jit
+def _compute_wo_v_l1_kernel(
+    V,
+    WO,
+    cu_k,
+    OUT,
+    STRIDE_V_NK,
+    STRIDE_V_HK,
+    STRIDE_V_D,
+    STRIDE_WO_HQ,
+    STRIDE_WO_D,
+    STRIDE_WO_HID,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    Hk: tl.constexpr,
+    Hq: tl.constexpr,
+    D: tl.constexpr,
+    HIDDEN: tl.constexpr,
+    QUERY_GROUP_SIZE: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    # For every (token, KV head) pair, write the L1 norm of the value vector
+    # projected through the output-projection slices of the query heads that
+    # share this KV head, averaged over those query heads:
+    #
+    #   OUT[n, hk] = mean_g  sum_hidden | V[n, hk, :] @ WO[hk * G + g, :, hidden] |
+    #
+    # Program ids: axis 0 = sequence, axis 1 = KV head, axis 2 = BLOCK_K-sized
+    # token tile inside the sequence. ``Hk`` and ``Hq`` are not read in the
+    # body (``Hk`` is an autotune key).
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+    ks = tl.program_id(2)
+
+    # Row range of this sequence in the packed token dimension.
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+
+    nk_off = ks * BLOCK_K + tl.arange(0, BLOCK_K)
+    nk = k_beg + nk_off
+    k_mask = nk < k_end  # tiles past the end of a short sequence are fully masked
+
+    out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+    l1_sum = tl.zeros([BLOCK_K], dtype=tl.float32)
+
+    for g in range(QUERY_GROUP_SIZE):
+        hq = hk * QUERY_GROUP_SIZE + g  # query head served by this KV head
+
+        # NOTE(review): this load does not depend on ``g`` and could be hoisted
+        # above the loop.
+        v_ptrs = (
+            V
+            + nk[:, None] * STRIDE_V_NK
+            + hk * STRIDE_V_HK
+            + tl.arange(0, D)[None, :] * STRIDE_V_D
+        )
+        v_blk = tl.load(v_ptrs, mask=k_mask[:, None], other=0.0).to(tl.float32)
+
+        # Walk the hidden dimension in BLOCK_D-wide tiles.
+        for hid_off in range(0, HIDDEN, BLOCK_D):
+            hid_idx = hid_off + tl.arange(0, BLOCK_D)
+            hid_mask = hid_idx < HIDDEN
+
+            wo_ptrs = (
+                WO
+                + hq * STRIDE_WO_HQ
+                + tl.arange(0, D)[:, None] * STRIDE_WO_D
+                + hid_idx[None, :] * STRIDE_WO_HID
+            )
+            # Out-of-range hidden columns load as 0 and add nothing to the sum.
+            wo_tile = tl.load(wo_ptrs, mask=hid_mask[None, :], other=0.0).to(tl.float32)
+
+            wov_tile = tl.dot(v_blk, wo_tile)  # [BLOCK_K, BLOCK_D]
+            l1_sum += tl.sum(tl.abs(wov_tile), axis=1)
+
+    # Average over the query heads of the group.
+    l1_sum = l1_sum / QUERY_GROUP_SIZE
+    tl.store(out_ptrs, l1_sum, mask=k_mask)
+
+
+# ============================================================================
+# Triton Kernel 2: Stage-1 protection + Stage-2 weighted fusion
+# ============================================================================
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_K": bk}) for bk in [32, 64, 128, 256]],
+    key=["Hk"],
+    cache_results=True,
+)
+@triton.jit
+def _critical_ada_fuse_kernel(
+    BASE_SCORES,
+    WO_V_NORM,
+    STAGE1_MASK,
+    cu_k,
+    OUT,
+    EPSILON: tl.constexpr,
+    STRIDE_BS_NK,
+    STRIDE_BS_HK,
+    STRIDE_WN_NK,
+    STRIDE_WN_HK,
+    STRIDE_S1_NK,
+    STRIDE_S1_HK,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    Hk: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    # Combine the base scores with the Wo·V norms and apply the Stage-1 mask:
+    #
+    #   OUT[n, hk] = +inf                                  if STAGE1_MASK[n, hk] == 1
+    #              = (BASE_SCORES[n, hk] + EPSILON) * WO_V_NORM[n, hk]   otherwise
+    #
+    # Program ids: axis 0 = sequence, axis 1 = KV head. Each program walks the
+    # tokens of its sequence in BLOCK_K-sized tiles. ``Hk`` is only used as the
+    # autotune key.
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+
+    # Row range of this sequence in the packed token dimension.
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+
+    for ks in tl.range(k_beg, k_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_end
+
+        bs_ptrs = BASE_SCORES + nk * STRIDE_BS_NK + hk * STRIDE_BS_HK
+        wn_ptrs = WO_V_NORM + nk * STRIDE_WN_NK + hk * STRIDE_WN_HK
+        s1_ptrs = STAGE1_MASK + nk * STRIDE_S1_NK + hk * STRIDE_S1_HK
+
+        base = tl.load(bs_ptrs, mask=kmask, other=0.0)
+        wnorm = tl.load(wn_ptrs, mask=kmask, other=1.0)
+        stage1_protect = tl.load(s1_ptrs, mask=kmask, other=0).to(tl.int32)
+
+        # Stage 2: re-weight the base score by the value norm.
+        fused = (base + EPSILON) * wnorm
+        # Stage 1: masked entries are forced to +inf.
+        fused = tl.where(stage1_protect == 1, float("inf"), fused)
+
+        out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+        tl.store(out_ptrs, fused, mask=kmask)
+
+
+def critical_ada_key_scores(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    wo_weight: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    base_scores: torch.Tensor,
+    compression_ctx: Any,
+    *,
+    store_stream: Optional[torch.cuda.Stream] = None,
+) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+    """
+    使用与引擎一致的保留预算 ``batch_tokens_to_retain``（每条序列的 (token, head) 对数），
+    在每条序列上尽量贴近 kvpress 的 CriticalAdaKV 语义：
+      1) alpha_safeguard 安全预算（每头至少保留一部分）；
+      2) 基于 base_scores 的 head-wise 自适应预算分配（head_budgets）；
+      3) Stage-1 按 head_budgets * first_stage_ratio 保护；
+      4) Stage-2 计算 ``(base + eps) * ||Wo@V||_1``，再按 head_budgets 做每头 top-k 保护。
+
+    Args:
+        compression_ctx: 与 ``CompressionContext`` 相同字段即可（duck typing），须含
+            ``batch_tokens_to_retain``、``protected_first_tokens``、``protected_last_tokens``；
+            可选 ``critical_ada_epsilon``、``critical_ada_first_stage_ratio``、
+            ``critical_ada_alpha_safeguard``。
+    """
+    assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
+    device = q.device
+    _, Hq, D = q.shape
+    N_k, Hk, Dk = k.shape
+    assert D == Dk and Hq % Hk == 0
+
+    # 与 non_causal_attn_scores 使用同一 cu（prefill 下即 context.cu_seqlens_q），
+    # 保证 base_scores 行与 Triton 分段一致；勿与 cu_seqlens_k 混用。
+    B = cu_seqlens.numel() - 1
+    G = Hq // Hk
+    k_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+
+    btr = compression_ctx.batch_tokens_to_retain
+    assert btr is not None and btr.numel() == B
+    btr = btr.to(device=device, dtype=torch.int32)
+
+    prot_first = compression_ctx.protected_first_tokens or [0] * B
+    prot_last = compression_ctx.protected_last_tokens or [0] * B
+    epsilon = compression_ctx.critical_ada_epsilon
+    first_stage_ratio = compression_ctx.critical_ada_first_stage_ratio
+    alpha_safeguard = float(getattr(compression_ctx, "critical_ada_alpha_safeguard", 0.2))
+    alpha_safeguard = max(0.0, min(1.0, alpha_safeguard))
+
+    if wo_weight.dim() == 2:
+        hidden_size, _ = wo_weight.shape
+        wo = wo_weight.transpose(0, 1).view(Hq, D, hidden_size).contiguous()
+    else:
+        wo = wo_weight.contiguous()
+    hidden_size = wo.size(-1)
+
+    wo_v_norm = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+
+    def grid_wo(META):
+        max_k_len = int(k_lengths.max().item())
+        return (B, Hk, triton.cdiv(max_k_len, META["BLOCK_K"]))
+
+    _compute_wo_v_l1_kernel[grid_wo](
+        v,
+        wo,
+        cu_seqlens,
+        wo_v_norm,
+        *v.stride(),
+        *wo.stride(),
+        *wo_v_norm.stride(),
+        Hk=Hk,
+        Hq=Hq,
+        D=D,
+        HIDDEN=hidden_size,
+        QUERY_GROUP_SIZE=G,
+    )
+
+    stage1_mask = torch.zeros((N_k, Hk), dtype=torch.int32, device=device)
+    # kvpress 风格的每头预算（按序列自适应），用于 Stage-1/Stage-2。
+    head_budgets_by_batch = []
+
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            head_budgets_by_batch.append(None)
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        s = int(prot_first[b]) if b < len(prot_first) else 0
+        e = int(prot_last[b]) if b < len(prot_last) else 0
+        lo, hi = k_beg + s, k_end - e
+        compressible = max(0, hi - lo)
+        keep_pairs = int(btr[b].item())
+        if compressible <= 0:
+            head_budgets_by_batch.append(None)
+            continue
+        # 每头 token 预算（kvpress 的 n_kept）
+        n_kept_tokens = max(1, keep_pairs // Hk)
+        n_kept_tokens = min(n_kept_tokens, compressible)
+        # 安全预算（每头至少保留 n_safe）
+        n_safe = int(n_kept_tokens * alpha_safeguard)
+        if n_safe > 0:
+            tk_safe = min(n_safe, compressible)
+            for hk in range(Hk):
+                safe_idx = torch.topk(base_scores[lo:hi, hk], tk_safe, sorted=False).indices
+                stage1_mask[lo + safe_idx, hk] = 1
+
+        # 自适应预算分配：在扁平 (token, head) 空间取 top n_kept_tokens*Hk，统计每个 head 的预算
+        budget_scores = base_scores[lo:hi, :].clone()
+        if n_safe > 0:
+            budget_scores[stage1_mask[lo:hi, :] == 1] = float("inf")
+        top_pairs = min(n_kept_tokens * Hk, budget_scores.numel())
+        if top_pairs <= 0:
+            head_budgets_by_batch.append(None)
+            continue
+        top_idx_flat = torch.topk(
+            budget_scores.reshape(-1), top_pairs, sorted=False
+        ).indices
+        top_head_idx = top_idx_flat % Hk
+        head_budgets = torch.bincount(top_head_idx, minlength=Hk).to(torch.int32)
+        head_budgets_by_batch.append(head_budgets)
+
+        # Stage-1：按 head_budgets 的 first_stage_ratio 分头保护（kvpress 语义）
+        for hk in range(Hk):
+            phase1_budget = int(head_budgets[hk].item() * first_stage_ratio)
+            if phase1_budget <= 0:
+                continue
+            tk = min(phase1_budget, compressible)
+            top_idx = torch.topk(base_scores[lo:hi, hk], tk, sorted=False).indices
+            stage1_mask[lo + top_idx, hk] = 1
+
+    final_scores = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+
+    def grid_fuse(_META):
+        return (B, Hk)
+
+    _critical_ada_fuse_kernel[grid_fuse](
+        base_scores,
+        wo_v_norm,
+        stage1_mask,
+        cu_seqlens,
+        final_scores,
+        EPSILON=epsilon,
+        *base_scores.stride(),
+        *wo_v_norm.stride(),
+        *stage1_mask.stride(),
+        *final_scores.stride(),
+        Hk=Hk,
+    )
+
+    # Stage-2（kvpress 语义）：在融合后按每头预算再做一次 top-k 保护。
+    for b in range(B):
+        hb = head_budgets_by_batch[b]
+        if hb is None:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        s = int(prot_first[b]) if b < len(prot_first) else 0
+        e = int(prot_last[b]) if b < len(prot_last) else 0
+        lo, hi = k_beg + s, k_end - e
+        if hi <= lo:
+            continue
+        region_len = hi - lo
+        for hk in range(Hk):
+            budget = int(hb[hk].item())
+            if budget <= 0:
+                continue
+            tk = min(budget, region_len)
+            idx = torch.topk(final_scores[lo:hi, hk], tk, sorted=False).indices
+            final_scores[lo + idx, hk] = float("inf")
+
+    masked_key_indices = None
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            continue
+        keep_pairs = int(btr[b].item())
+        total_pairs = k_len * Hk
+        if keep_pairs >= total_pairs:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        n_prune_pairs = min(total_pairs - keep_pairs, total_pairs)
+        if n_prune_pairs <= 0:
+            continue
+
+        flat_scores = final_scores[k_beg:k_end, :].reshape(-1)
+        prune_idx = torch.topk(
+            -flat_scores, min(n_prune_pairs, flat_scores.numel()), sorted=False
+        ).indices
+        batch_idx = torch.full_like(prune_idx, b, dtype=torch.int64)
+        head_idx = prune_idx % Hk
+        seq_idx = prune_idx // Hk + k_beg
+        if masked_key_indices is None:
+            masked_key_indices = (batch_idx, head_idx, seq_idx)
+        else:
+            masked_key_indices = (
+                torch.cat([masked_key_indices[0], batch_idx]),
+                torch.cat([masked_key_indices[1], head_idx]),
+                torch.cat([masked_key_indices[2], seq_idx]),
+            )
+
+    if store_stream is not None:
+        final_scores.record_stream(store_stream)
+
+    return final_scores, masked_key_indices
+
+
+class CriticalAdaKVCompression(BaseCompressionMethod):
+    """
+    以 CompactorCompression 为基分（pre RoPE 杠杆 + post RoPE 非因果融合），
+    再应用 CriticalAda 两阶段加权；须由 Attention 在 post-RoPE 前注入 ``compression_context.wo_weight``。
+    """
+
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        cc = context.compression_context
+        base = getattr(cc, "critical_ada_base_scorer", "compactor") if cc is not None else "compactor"
+        if str(base).lower() == "snapkv":
+            return SnapKVCompression.pre_rope_scoring(q, k, v, context)
+        return CompactorCompression.pre_rope_scoring(q, k, v, context)
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: Optional[torch.Tensor],
+        context,
+    ) -> Optional[torch.Tensor]:
+        compression_context = context.compression_context
+        assert compression_context is not None
+        base = str(getattr(compression_context, "critical_ada_base_scorer", "compactor")).lower()
+
+        if base == "snapkv":
+            base_scores = SnapKVCompression.post_rope_scoring(q, k, v, pre_rope_scores, context)
+        else:
+            # 与 compactor.py 中 CompactorCompression.post_rope_scoring 逐字一致：
+            # maybe_execute_in_stream(non_causal_attn_scores, q,k,v, cu_seqlens_q, max_seqlen_q, ...)
+            # 不得改为其它封装，否则与单独使用 COMPACTOR 时分数字不一致。
+            if context.STORE_STREAM is not None:
+                torch.cuda.current_stream().wait_stream(context.STORE_STREAM)
+
+            base_scores = maybe_execute_in_stream(
+                non_causal_attn_scores,
+                q,
+                k,
+                v,
+                context.cu_seqlens_q,
+                context.max_seqlen_q,
+                chunk_size=CompactorCompression.chunk_size,
+                sm_scale=1.0,
+                normalize=True,
+                accum_scores=pre_rope_scores,
+                context_lens=compression_context.context_lens,
+                protected_first_tokens=compression_context.protected_first_tokens,
+                protected_last_tokens=compression_context.protected_last_tokens,
+                accum_blending=0.5,
+            )
+
+        wo_weight = compression_context.wo_weight
+        if wo_weight is None:
+            return base_scores
+
+        scores, _masked = maybe_execute_in_stream(
+            critical_ada_key_scores,
+            q,
+            k,
+            v,
+            wo_weight,
+            context.cu_seqlens_q,
+            base_scores,
+            compression_context,
+            STORE_STREAM=context.STORE_STREAM,
+            store_stream=context.STORE_STREAM,
+        )
+        return scores
+
+    @staticmethod
+    def prepare_layer(module: torch.nn.Module, device: torch.device, dtype: torch.dtype):
+        """可选：预计算并缓存 Wo；实际推理以 Attention.forward 中注入的 ``cc.wo_weight`` 为准。"""
+        if not hasattr(module, "o_proj") or module.o_proj.weight is None:
+            return
+        if not hasattr(module, "num_heads") or not hasattr(module, "head_dim"):
+            return
+        wo_raw = module.o_proj.weight.data
+        hidden_size, _ = wo_raw.shape
+        Hq = module.num_heads
+        head_dim = module.head_dim
+        wo = (
+            wo_raw.transpose(0, 1)
+            .view(Hq, head_dim, hidden_size)
+            .to(device=device, dtype=torch.float32)
+        )
+        module._critical_ada_wo_weight = wo
+
--- a/vllm/kvprune/compression/criticalkv.py
+++ b/vllm/kvprune/compression/criticalkv.py
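+"""
+CriticalAdaKV: on top of the Compactor base scores (pre-RoPE leverage scores fused with
+post-RoPE non-causal attention), Stage 2 re-weights the scores with the L1 norm of the output
+projection Wo applied to Value; Stage 1 protects the in-budget top-k of the Compactor base scores.
+
+The budget matches the vllm.kvprune engine: ``compression_context.batch_tokens_to_retain`` (the
+number of flattened (token, head) pairs). The main CriticalAda chain is implemented in **PyTorch**
+and aligned with kvpress ``CriticalAdaKVPress.compress``; ``||Wo@V||_1`` still uses the Triton
+``_compute_wo_v_l1_kernel`` by default (same formula as ``CriticalKVPress.vwl1norm``).
+Set ``_USE_WO_L1_REFERENCE_BACKEND`` to ``True`` to use ``_vwl1_norm_kvpress_reference`` instead.
+
+Note: do not load ``vllm.kvprune.utils.context`` at import time (it imports ``CompressionMethod``
+again, which forms a cycle with ``compression/__init__.py`` importing this module). At runtime only
+duck-typed objects with the same fields as ``CompressionContext`` are used.
+"""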
+
+from __future__ import annotations
+
+from typing import Any, Optional, Tuple
+
+import torch
+import triton
+from triton import language as tl
+from transformers.models.llama.modeling_llama import repeat_kv
+
+from vllm.kvprune.compression.common import BaseCompressionMethod
+from vllm.kvprune.compression.compactor import (
+    CompactorCompression,
+    kvpress_compactor_post_rope,
+    resolve_kvpress_compactor_blending,
+)
+from vllm.kvprune.compression.snapkv import SnapKVCompression
+from vllm.kvprune.utils.helpers import maybe_execute_in_stream
+from vllm.kvprune.utils.triton_compat import autotune as triton_autotune
+
+# Backend for the L1 norm of Wo@V: False = Triton (default), True = PyTorch reference (debugging / alignment)
+_USE_WO_L1_REFERENCE_BACKEND = False
+
+
+def _vwl1_norm_kvpress_reference(
+    values_seg: torch.Tensor,
+    wo: torch.Tensor,
+    num_kv_heads: int,
+    num_query_groups: int,
+) -> torch.Tensor:
+    """
+    与 kvpress ``CriticalKVPress.vwl1norm`` 等价的 **可选参考实现**（PyTorch，仅用于核对；
+    将 ``_USE_WO_L1_REFERENCE_BACKEND`` 置为 ``True`` 时选用，默认走 Triton）。
+
+    算法：repeat_kv → 逐 query 头 ``|V @ Wo_h|_1`` → 在 GQA 组上 mean，与 Triton 路径同一公式。
+    """
+    k_len, Hk, D = values_seg.shape
+    Hq, D_wo, hidden = wo.shape
+    assert D == D_wo and Hk == num_kv_heads and Hq == Hk * num_query_groups
+    # [1, Hk, k_len, D] 与 HF repeat_kv 约定一致
+    v_4d = values_seg.permute(1, 0, 2).unsqueeze(0).contiguous()
+    v_rep = repeat_kv(v_4d, num_query_groups)  # [1, Hq, k_len, D]
+    # Wo 在 attention 里注入为 float32，V 常为 bf16/fp16，matmul 前对齐 dtype
+    wo_f = wo
+    head_list = []
+    for head in range(Hq):
+        v_h = v_rep[0, head, :, :].to(dtype=wo_f.dtype)
+        head_wov = v_h.matmul(wo_f[head, :, :])
+        head_wov_norm = torch.norm(head_wov, p=1, dim=-1)
+        head_list.append(head_wov_norm)
+    stacked = torch.stack(head_list, dim=0)  # [Hq, k_len]
+    stacked = stacked.view(Hk, num_query_groups, k_len).mean(dim=1)
+    return stacked.transpose(0, 1).contiguous()
+
+
+# ============================================================================
+# Triton：||Wo @ V||₁ 按 kvpress 定义（GQA 上对 query 组 L1 后取均值）
+# ============================================================================
+@triton_autotune(
+    configs=[
+        triton.Config({"BLOCK_K": bk, "BLOCK_D": bd}, num_warps=nw, num_stages=ns)
+        for bk in [32, 64, 128]
+        for bd in [32, 64]
+        for nw in [4, 8]
+        for ns in [3, 4]
+    ],
+    key=["Hk", "D", "HIDDEN"],
+    cache_results=True,
+)
+@triton.jit
+def _compute_wo_v_l1_kernel(
+    V,
+    WO,
+    cu_k,
+    OUT,
+    STRIDE_V_NK,
+    STRIDE_V_HK,
+    STRIDE_V_D,
+    STRIDE_WO_HQ,
+    STRIDE_WO_D,
+    STRIDE_WO_HID,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    Hk: tl.constexpr,
+    Hq: tl.constexpr,
+    D: tl.constexpr,
+    HIDDEN: tl.constexpr,
+    QUERY_GROUP_SIZE: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    """对每个 KV 头：对 G 个 query 头分别算 ``sum(|V @ Wo|)``，再除以 G（与 kvpress mean 一致）。"""
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+    ks = tl.program_id(2)
+
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+
+    nk_off = ks * BLOCK_K + tl.arange(0, BLOCK_K)
+    nk = k_beg + nk_off
+    k_mask = nk < k_end
+
+    out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+    l1_sum = tl.zeros([BLOCK_K], dtype=tl.float32)
+
+    for g in range(QUERY_GROUP_SIZE):
+        hq = hk * QUERY_GROUP_SIZE + g
+
+        v_ptrs = (
+            V
+            + nk[:, None] * STRIDE_V_NK
+            + hk * STRIDE_V_HK
+            + tl.arange(0, D)[None, :] * STRIDE_V_D
+        )
+        v_blk = tl.load(v_ptrs, mask=k_mask[:, None], other=0.0).to(tl.float32)
+
+        for hid_off in range(0, HIDDEN, BLOCK_D):
+            hid_idx = hid_off + tl.arange(0, BLOCK_D)
+            hid_mask = hid_idx < HIDDEN
+
+            wo_ptrs = (
+                WO
+                + hq * STRIDE_WO_HQ
+                + tl.arange(0, D)[:, None] * STRIDE_WO_D
+                + hid_idx[None, :] * STRIDE_WO_HID
+            )
+            wo_tile = tl.load(wo_ptrs, mask=hid_mask[None, :], other=0.0).to(tl.float32)
+
+            wov_tile = tl.dot(v_blk, wo_tile)
+            l1_sum += tl.sum(tl.abs(wov_tile), axis=1)
+
+    l1_sum = l1_sum / QUERY_GROUP_SIZE
+    tl.store(out_ptrs, l1_sum, mask=k_mask)
+
+
+def critical_ada_key_scores(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    wo_weight: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    base_scores: torch.Tensor,
+    compression_ctx: Any,
+    *,
+    store_stream: Optional[torch.cuda.Stream] = None,
+) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+    """
+    使用与引擎一致的保留预算 ``batch_tokens_to_retain``（每条序列的 (token, head) 对数），
+    按 kvpress ``CriticalAdaKVPress.compress`` 的顺序实现：safeguard scatter →
+    head-major 展平做 head_budgets → Stage1 在 **已抬高** 的分数上 top-k →
+    ``(scores + ε) * ||WoV||₁`` → Stage2 scatter → 最终按 head-major 展平做 bottom-k。
+
+    ``||Wo@V||₁`` 仍用 Triton（``_compute_wo_v_l1_kernel``）；中间 CriticalAda 步骤用 PyTorch
+    与 kvpress 逐句对齐。仅 base 分数来自 Compactor/SnapKV。
+
+    Args:
+        compression_ctx: 与 ``CompressionContext`` 相同字段即可（duck typing），须含
+            ``batch_tokens_to_retain``；可选 ``critical_ada_epsilon``、
+            ``critical_ada_first_stage_ratio``、``critical_ada_alpha_safeguard``。
+    """
+    assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
+    device = q.device
+    _, Hq, D = q.shape
+    N_k, Hk, Dk = k.shape
+    assert D == Dk and Hq % Hk == 0
+
+    # 与 non_causal_attn_scores 使用同一 cu（prefill 下即 context.cu_seqlens_q），
+    # 保证 base_scores 行与 Triton 分段一致；勿与 cu_seqlens_k 混用。
+    B = cu_seqlens.numel() - 1
+    G = Hq // Hk
+    k_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+
+    btr = compression_ctx.batch_tokens_to_retain
+    assert btr is not None and btr.numel() == B
+    btr = btr.to(device=device, dtype=torch.int32)
+
+    epsilon = compression_ctx.critical_ada_epsilon
+    first_stage_ratio = compression_ctx.critical_ada_first_stage_ratio
+    alpha_safeguard = float(compression_ctx.critical_ada_alpha_safeguard)
+    alpha_safeguard = max(0.0, min(1.0, alpha_safeguard))
+
+    if wo_weight.dim() == 2:
+        hidden_size, _ = wo_weight.shape
+        wo = wo_weight.transpose(0, 1).view(Hq, D, hidden_size).contiguous()
+    else:
+        wo = wo_weight.contiguous()
+    hidden_size = wo.size(-1)
+
+    wo_v_norm = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+    if B > 0 and int(k_lengths.max().item()) > 0:
+        if _USE_WO_L1_REFERENCE_BACKEND:
+            for b in range(B):
+                k_beg = int(cu_seqlens[b].item())
+                k_end = int(cu_seqlens[b + 1].item())
+                if k_end <= k_beg:
+                    continue
+                v_seg = v[k_beg:k_end, :, :].contiguous()
+                wo_v_norm[k_beg:k_end, :] = _vwl1_norm_kvpress_reference(
+                    v_seg, wo, Hk, G
+                )
+        else:
+
+            def grid_wo(META):
+                max_k_len = int(k_lengths.max().item())
+                return (B, Hk, triton.cdiv(max_k_len, META["BLOCK_K"]))
+
+            _compute_wo_v_l1_kernel[grid_wo](
+                v,
+                wo,
+                cu_seqlens,
+                wo_v_norm,
+                *v.stride(),
+                *wo.stride(),
+                *wo_v_norm.stride(),
+                Hk=Hk,
+                Hq=Hq,
+                D=D,
+                HIDDEN=hidden_size,
+                QUERY_GROUP_SIZE=G,
+            )
+
+    # kvpress 用 finfo.max 抬高分数；与 inf 混用时 topk 行为一致
+    _score_max = float(torch.finfo(torch.float32).max)
+
+    final_scores = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+    head_budgets_by_batch: list[Optional[torch.Tensor]] = []
+
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        if k_len == 0:
+            head_budgets_by_batch.append(None)
+            continue
+
+        scores_seg = base_scores[k_beg:k_end, :].float()
+        keep_pairs = int(btr[b].item())
+        n_kept_tokens = max(1, keep_pairs // Hk)
+        n_kept_tokens = min(n_kept_tokens, k_len)
+
+        # scores_work: 布局 [k_len, Hk]，对应 kvpress [bsz=1, H, k_len] 的 transpose(0,2) 视角下沿 token 维的 topk
+        scores_work = scores_seg.clone()
+
+        # --- Alpha safeguard（kvpress L148–152）---
+        n_safe = int(n_kept_tokens * alpha_safeguard)
+        nk = min(n_safe, k_len) if n_safe > 0 else 0
+        if nk > 0:
+            for hk in range(Hk):
+                top_idx = torch.topk(scores_work[:, hk], nk, dim=0, largest=True).indices
+                scores_work[top_idx, hk] = _score_max
+
+        # --- Head budgets：kvpress L158–164，展平顺序与 [bsz, H, k_len] 一致（head-major：h*K + t）---
+        top_pairs = min(n_kept_tokens * Hk, k_len * Hk)
+        if top_pairs <= 0:
+            head_budgets_by_batch.append(None)
+            wn = wo_v_norm[k_beg:k_end, :]
+            final_scores[k_beg:k_end, :] = (scores_seg + epsilon) * wn
+            continue
+
+        budget_flat = scores_work.permute(1, 0).contiguous().reshape(-1)
+        top_idx_flat = torch.topk(
+            budget_flat, top_pairs, largest=True, sorted=False
+        ).indices
+        top_head_idx = top_idx_flat // k_len
+        head_budgets = torch.bincount(top_head_idx, minlength=Hk).to(torch.int64)
+        head_budgets_by_batch.append(head_budgets)
+
+        # --- Stage 1（kvpress L166–171）：在已 safeguard 的 scores_work 上沿 token 维 top-k ---
+        head_selection_budget_1st = (
+            (head_budgets.to(torch.float32) * float(first_stage_ratio))
+            .to(torch.int64)
+            .tolist()
+        )
+        M1 = max(head_selection_budget_1st) if head_selection_budget_1st else 0
+        mk = min(M1, k_len) if M1 > 0 else 0
+        if mk > 0:
+            top_k_index = torch.topk(scores_work, mk, dim=0, largest=True, sorted=True).indices
+            for hk in range(Hk):
+                phase1_budget = int(head_selection_budget_1st[hk])
+                if phase1_budget <= 0:
+                    continue
+                take = min(phase1_budget, mk)
+                scores_work[top_k_index[:take, hk], hk] = _score_max
+
+        # --- Stage 2 重加权（kvpress L173–175）---
+        wn = wo_v_norm[k_beg:k_end, :]
+        scores_fused = (scores_work + epsilon) * wn
+
+        # --- Stage 2 scatter（kvpress L176–179）---
+        M2 = int(head_budgets.max().item())
+        mk2 = min(M2, k_len) if M2 > 0 else 0
+        if mk2 > 0:
+            top_k_index2 = torch.topk(
+                scores_fused, mk2, dim=0, largest=True, sorted=True
+            ).indices
+            for hk in range(Hk):
+                budget = int(head_budgets[hk].item())
+                if budget <= 0:
+                    continue
+                take = min(budget, mk2)
+                scores_fused[top_k_index2[:take, hk], hk] = _score_max
+
+        final_scores[k_beg:k_end, :] = scores_fused
+
+    masked_key_indices = None
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            continue
+        keep_pairs = int(btr[b].item())
+        total_pairs = k_len * Hk
+        if keep_pairs >= total_pairs:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        n_prune_pairs = min(total_pairs - keep_pairs, total_pairs)
+        if n_prune_pairs <= 0:
+            continue
+
+        # kvpress L187：``scores.reshape(bsz, -1)`` 即 [H, K] 按 head-major 展平（flat = h*K + t）
+        flat_scores = (
+            final_scores[k_beg:k_end, :].permute(1, 0).contiguous().reshape(-1)
+        )
+        prune_idx = torch.topk(
+            -flat_scores, min(n_prune_pairs, flat_scores.numel()), sorted=False
+        ).indices
+        batch_idx = torch.full_like(prune_idx, b, dtype=torch.int64)
+        head_idx = prune_idx // k_len
+        seq_idx = prune_idx % k_len + k_beg
+        if masked_key_indices is None:
+            masked_key_indices = (batch_idx, head_idx, seq_idx)
+        else:
+            masked_key_indices = (
+                torch.cat([masked_key_indices[0], batch_idx]),
+                torch.cat([masked_key_indices[1], head_idx]),
+                torch.cat([masked_key_indices[2], seq_idx]),
+            )
+
+    if store_stream is not None:
+        final_scores.record_stream(store_stream)
+
+    return final_scores, masked_key_indices
+
+
+class CriticalAdaKVCompression(BaseCompressionMethod):
+    """
+    仅 ``critical_ada_base_scorer == "compactor"`` 时与 kvpress ``CompactorPress.score`` 一致
+    （``kvpress_compactor_post_rope``：``blending * l_scores + attn_scores``）；其它 base（如 SnapKV）
+    走对应单一 ScorerPress，再叠 CriticalAda。须由 Attention 在 post-RoPE 前注入 ``compression_context.wo_weight``。
+    """
+
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        cc = context.compression_context
+        base = (
+            getattr(cc, "critical_ada_base_scorer", "compactor")
+            if cc is not None
+            else "compactor"
+        )
+        if str(base).lower() == "compactor":
+            return CompactorCompression.pre_rope_scoring(q, k, v, context)
+        return SnapKVCompression.pre_rope_scoring(q, k, v, context)
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: Optional[torch.Tensor],
+        context,
+    ) -> Optional[torch.Tensor]:
+        compression_context = context.compression_context
+        assert compression_context is not None
+        base = str(getattr(compression_context, "critical_ada_base_scorer", "compactor")).lower()
+
+        if base == "compactor":
+            # 特例：与 ``CompactorPress.score`` / ``CompactorCompression.post_rope_scoring`` 一致。
+            if context.STORE_STREAM is not None:
+                torch.cuda.current_stream().wait_stream(context.STORE_STREAM)
+
+            blending = resolve_kvpress_compactor_blending(compression_context)
+            base_scores = maybe_execute_in_stream(
+                kvpress_compactor_post_rope,
+                q,
+                k,
+                v,
+                context.cu_seqlens_q,
+                pre_rope_scores,
+                compression_context,
+                context.max_seqlen_q,
+                chunk_size=CompactorCompression.chunk_size,
+                blending=float(blending),
+                STORE_STREAM=context.STORE_STREAM,
+            )
+        else:
+            base_scores = SnapKVCompression.post_rope_scoring(
+                q, k, v, pre_rope_scores, context
+            )
+
+        wo_weight = compression_context.wo_weight
+        if wo_weight is None:
+            return base_scores
+
+        scores, _masked = maybe_execute_in_stream(
+            critical_ada_key_scores,
+            q,
+            k,
+            v,
+            wo_weight,
+            context.cu_seqlens_q,
+            base_scores,
+            compression_context,
+            STORE_STREAM=context.STORE_STREAM,
+            store_stream=context.STORE_STREAM,
+        )
+        return scores
+
+    @staticmethod
+    def prepare_layer(module: torch.nn.Module, device: torch.device, dtype: torch.dtype):
+        """可选：预计算并缓存 Wo；实际推理以 Attention.forward 中注入的 ``cc.wo_weight`` 为准。"""
+        if not hasattr(module, "o_proj") or module.o_proj.weight is None:
+            return
+        if not hasattr(module, "num_heads") or not hasattr(module, "head_dim"):
+            return
+        wo_raw = module.o_proj.weight.data
+        hidden_size, _ = wo_raw.shape
+        Hq = module.num_heads
+        head_dim = module.head_dim
+        wo = (
+            wo_raw.transpose(0, 1)
+            .view(Hq, head_dim, hidden_size)
+            .to(device=device, dtype=torch.float32)
+        )
+        module._critical_ada_wo_weight = wo
+
+
--- a/vllm/kvprune/compression/criticalkv_origin.py
+++ b/vllm/kvprune/compression/criticalkv_origin.py
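+"""
+CriticalAdaKV: on top of the Compactor base scores (pre-RoPE leverage scores fused with
+post-RoPE non-causal attention), Stage 2 re-weights the scores with the L1 norm of the output
+projection Wo applied to Value; Stage 1 protects the in-budget top-k of the Compactor base scores.
+
+The budget matches the vllm.kvprune engine: ``compression_context.batch_tokens_to_retain`` (the
+number of flattened (token, head) pairs). Stage 1/2 follow the kvpress paper/implementation;
+``||Wo@V||_1`` is **algorithmically** the same as ``CriticalKVPress.vwl1norm`` (per-query-head L1
+under GQA, then the mean over the group). **Triton is the default** (``_compute_wo_v_l1_kernel``);
+to align row by row with PyTorch, set the module-level ``_USE_WO_L1_REFERENCE_BACKEND`` to ``True``
+to use ``_vwl1_norm_kvpress_reference``.
+
+Note: do not load ``vllm.kvprune.utils.context`` at import time (it imports ``CompressionMethod``
+again, which forms a cycle with ``compression/__init__.py`` importing this module). At runtime only
+duck-typed objects with the same fields as ``CompressionContext`` are used.
+"""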
+
+from __future__ import annotations
+
+from typing import Any, Optional, Tuple
+
+import torch
+import triton
+from triton import language as tl
+from transformers.models.llama.modeling_llama import repeat_kv
+
+from vllm.kvprune.compression.common import BaseCompressionMethod
+from vllm.kvprune.compression.compactor import (
+    CompactorCompression,
+    non_causal_attn_scores,
+)
+from vllm.kvprune.compression.snapkv import SnapKVCompression
+from vllm.kvprune.utils.helpers import maybe_execute_in_stream
+from vllm.kvprune.utils.triton_compat import autotune as triton_autotune
+
+# Backend for the L1 norm of Wo@V: False = Triton (default), True = PyTorch reference (debugging / alignment)
+_USE_WO_L1_REFERENCE_BACKEND = False
+
+
+def _vwl1_norm_kvpress_reference(
+    values_seg: torch.Tensor,
+    wo: torch.Tensor,
+    num_kv_heads: int,
+    num_query_groups: int,
+) -> torch.Tensor:
+    """
+    与 kvpress ``CriticalKVPress.vwl1norm`` 等价的 **可选参考实现**（PyTorch，仅用于核对；
+    将 ``_USE_WO_L1_REFERENCE_BACKEND`` 置为 ``True`` 时选用，默认走 Triton）。
+
+    算法：repeat_kv → 逐 query 头 ``|V @ Wo_h|_1`` → 在 GQA 组上 mean，与 Triton 路径同一公式。
+    """
+    k_len, Hk, D = values_seg.shape
+    Hq, D_wo, hidden = wo.shape
+    assert D == D_wo and Hk == num_kv_heads and Hq == Hk * num_query_groups
+    # [1, Hk, k_len, D] 与 HF repeat_kv 约定一致
+    v_4d = values_seg.permute(1, 0, 2).unsqueeze(0).contiguous()
+    v_rep = repeat_kv(v_4d, num_query_groups)  # [1, Hq, k_len, D]
+    # Wo 在 attention 里注入为 float32，V 常为 bf16/fp16，matmul 前对齐 dtype
+    wo_f = wo
+    head_list = []
+    for head in range(Hq):
+        v_h = v_rep[0, head, :, :].to(dtype=wo_f.dtype)
+        head_wov = v_h.matmul(wo_f[head, :, :])
+        head_wov_norm = torch.norm(head_wov, p=1, dim=-1)
+        head_list.append(head_wov_norm)
+    stacked = torch.stack(head_list, dim=0)  # [Hq, k_len]
+    stacked = stacked.view(Hk, num_query_groups, k_len).mean(dim=1)
+    return stacked.transpose(0, 1).contiguous()
+
+
+# ============================================================================
+# Triton：||Wo @ V||₁ 按 kvpress 定义（GQA 上对 query 组 L1 后取均值）
+# ============================================================================
+@triton_autotune(
+    configs=[
+        triton.Config({"BLOCK_K": bk, "BLOCK_D": bd}, num_warps=nw, num_stages=ns)
+        for bk in [32, 64, 128]
+        for bd in [32, 64]
+        for nw in [4, 8]
+        for ns in [3, 4]
+    ],
+    key=["Hk", "D", "HIDDEN"],
+    cache_results=True,
+)
+@triton.jit
+def _compute_wo_v_l1_kernel(
+    V,
+    WO,
+    cu_k,
+    OUT,
+    STRIDE_V_NK,
+    STRIDE_V_HK,
+    STRIDE_V_D,
+    STRIDE_WO_HQ,
+    STRIDE_WO_D,
+    STRIDE_WO_HID,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    Hk: tl.constexpr,
+    Hq: tl.constexpr,
+    D: tl.constexpr,
+    HIDDEN: tl.constexpr,
+    QUERY_GROUP_SIZE: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    """对每个 KV 头：对 G 个 query 头分别算 ``sum(|V @ Wo|)``，再除以 G（与 kvpress mean 一致）。"""
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+    ks = tl.program_id(2)
+
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+
+    nk_off = ks * BLOCK_K + tl.arange(0, BLOCK_K)
+    nk = k_beg + nk_off
+    k_mask = nk < k_end
+
+    out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+    l1_sum = tl.zeros([BLOCK_K], dtype=tl.float32)
+
+    for g in range(QUERY_GROUP_SIZE):
+        hq = hk * QUERY_GROUP_SIZE + g
+
+        v_ptrs = (
+            V
+            + nk[:, None] * STRIDE_V_NK
+            + hk * STRIDE_V_HK
+            + tl.arange(0, D)[None, :] * STRIDE_V_D
+        )
+        v_blk = tl.load(v_ptrs, mask=k_mask[:, None], other=0.0).to(tl.float32)
+
+        for hid_off in range(0, HIDDEN, BLOCK_D):
+            hid_idx = hid_off + tl.arange(0, BLOCK_D)
+            hid_mask = hid_idx < HIDDEN
+
+            wo_ptrs = (
+                WO
+                + hq * STRIDE_WO_HQ
+                + tl.arange(0, D)[:, None] * STRIDE_WO_D
+                + hid_idx[None, :] * STRIDE_WO_HID
+            )
+            wo_tile = tl.load(wo_ptrs, mask=hid_mask[None, :], other=0.0).to(tl.float32)
+
+            wov_tile = tl.dot(v_blk, wo_tile)
+            l1_sum += tl.sum(tl.abs(wov_tile), axis=1)
+
+    l1_sum = l1_sum / QUERY_GROUP_SIZE
+    tl.store(out_ptrs, l1_sum, mask=k_mask)
+
+
+# ============================================================================
+# Triton: Stage 1 protection + Stage 2 weighted fusion (element-wise): OUT = inf where STAGE1_MASK == 1, else (BASE_SCORES + EPSILON) * WO_V_NORM
+# ============================================================================
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_K": bk}) for bk in [32, 64, 128, 256]],
+    key=["Hk"],
+    cache_results=True,
+)
+@triton.jit
+def _critical_ada_fuse_kernel(
+    BASE_SCORES,
+    WO_V_NORM,
+    STAGE1_MASK,
+    cu_k,
+    OUT,
+    STRIDE_BS_NK,
+    STRIDE_BS_HK,
+    STRIDE_WN_NK,
+    STRIDE_WN_HK,
+    STRIDE_S1_NK,
+    STRIDE_S1_HK,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    EPSILON: tl.constexpr,
+    Hk: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    b = tl.program_id(0)  # program axis 0 indexes cu_k, i.e. one sequence
+    hk = tl.program_id(1)  # program axis 1 selects the head column
+
+    k_beg = tl.load(cu_k + b)  # first row of this sequence
+    k_end = tl.load(cu_k + b + 1)  # one past the last row of this sequence
+
+    for ks in tl.range(k_beg, k_end, BLOCK_K):  # walk the sequence in BLOCK_K-row tiles
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_end  # masks rows past the end of the sequence in the last tile
+
+        bs_ptrs = BASE_SCORES + nk * STRIDE_BS_NK + hk * STRIDE_BS_HK
+        wn_ptrs = WO_V_NORM + nk * STRIDE_WN_NK + hk * STRIDE_WN_HK
+        s1_ptrs = STAGE1_MASK + nk * STRIDE_S1_NK + hk * STRIDE_S1_HK
+
+        base = tl.load(bs_ptrs, mask=kmask, other=0.0)
+        wnorm = tl.load(wn_ptrs, mask=kmask, other=1.0)
+        stage1_protect = tl.load(s1_ptrs, mask=kmask, other=0).to(tl.int32)
+
+        fused = (base + EPSILON) * wnorm  # Stage 2 re-weighting
+        fused = tl.where(stage1_protect == 1, float("inf"), fused)  # Stage 1 entries are forced to +inf
+
+        out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+        tl.store(out_ptrs, fused, mask=kmask)  # masked rows are not written
+
+
+def critical_ada_key_scores(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    wo_weight: torch.Tensor,
+    cu_seqlens: torch.Tensor,
+    base_scores: torch.Tensor,
+    compression_ctx: Any,
+    *,
+    store_stream: Optional[torch.cuda.Stream] = None,
+) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+    """
+    使用与引擎一致的保留预算 ``batch_tokens_to_retain``（每条序列的 (token, head) 对数），
+    在每条序列上对齐 kvpress ``CriticalAdaKVPress.compress``（整段 ``k_len``、与源实现相同的
+    top-k / scatter 顺序）；仅 base 分数来自 vllm.kvprune 的 Compactor/SnapKV。
+
+    Args:
+        compression_ctx: 与 ``CompressionContext`` 相同字段即可（duck typing），须含
+            ``batch_tokens_to_retain``；可选 ``critical_ada_epsilon``、
+            ``critical_ada_first_stage_ratio``、``critical_ada_alpha_safeguard``。
+    """
+    assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
+    device = q.device
+    _, Hq, D = q.shape
+    N_k, Hk, Dk = k.shape
+    assert D == Dk and Hq % Hk == 0
+
+    # 与 non_causal_attn_scores 使用同一 cu（prefill 下即 context.cu_seqlens_q），
+    # 保证 base_scores 行与 Triton 分段一致；勿与 cu_seqlens_k 混用。
+    B = cu_seqlens.numel() - 1
+    G = Hq // Hk
+    k_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+
+    btr = compression_ctx.batch_tokens_to_retain
+    assert btr is not None and btr.numel() == B
+    btr = btr.to(device=device, dtype=torch.int32)
+
+    epsilon = compression_ctx.critical_ada_epsilon
+    first_stage_ratio = compression_ctx.critical_ada_first_stage_ratio
+    alpha_safeguard = float(compression_ctx.critical_ada_alpha_safeguard)
+    alpha_safeguard = max(0.0, min(1.0, alpha_safeguard))
+
+    if wo_weight.dim() == 2:
+        hidden_size, _ = wo_weight.shape
+        wo = wo_weight.transpose(0, 1).view(Hq, D, hidden_size).contiguous()
+    else:
+        wo = wo_weight.contiguous()
+    hidden_size = wo.size(-1)
+
+    wo_v_norm = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+    if B > 0 and int(k_lengths.max().item()) > 0:
+        if _USE_WO_L1_REFERENCE_BACKEND:
+            for b in range(B):
+                k_beg = int(cu_seqlens[b].item())
+                k_end = int(cu_seqlens[b + 1].item())
+                if k_end <= k_beg:
+                    continue
+                v_seg = v[k_beg:k_end, :, :].contiguous()
+                wo_v_norm[k_beg:k_end, :] = _vwl1_norm_kvpress_reference(
+                    v_seg, wo, Hk, G
+                )
+        else:
+
+            def grid_wo(META):
+                max_k_len = int(k_lengths.max().item())
+                return (B, Hk, triton.cdiv(max_k_len, META["BLOCK_K"]))
+
+            _compute_wo_v_l1_kernel[grid_wo](
+                v,
+                wo,
+                cu_seqlens,
+                wo_v_norm,
+                *v.stride(),
+                *wo.stride(),
+                *wo_v_norm.stride(),
+                Hk=Hk,
+                Hq=Hq,
+                D=D,
+                HIDDEN=hidden_size,
+                QUERY_GROUP_SIZE=G,
+            )
+
+    stage1_mask = torch.zeros((N_k, Hk), dtype=torch.int32, device=device)
+    head_budgets_by_batch: list[Optional[torch.Tensor]] = []
+
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            head_budgets_by_batch.append(None)
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        keep_pairs = int(btr[b].item())
+        scores_seg = base_scores[k_beg:k_end, :]
+        # 与 kvpress 的 n_kept 一致：每头保留 n_kept 个 token
+        n_kept_tokens = max(1, keep_pairs // Hk)
+        n_kept_tokens = min(n_kept_tokens, k_len)
+
+        # kvpress：topk 在「未改动的」scores 上取索引，scatter 只写在副本上，供 head_budgets 用；
+        # Stage1 仍用原始 scores_seg（见下）。
+        working = scores_seg.clone()
+        n_safe = int(n_kept_tokens * alpha_safeguard)
+        if n_safe > 0:
+            nk = min(n_safe, k_len)
+            for hk in range(Hk):
+                top_idx = torch.topk(scores_seg[:, hk], nk, sorted=True).indices
+                working[:, hk].scatter_(0, top_idx, float("inf"))
+
+        top_pairs = min(n_kept_tokens * Hk, working.numel())
+        if top_pairs <= 0:
+            head_budgets_by_batch.append(None)
+            continue
+        top_idx_flat = torch.topk(working.reshape(-1), top_pairs, sorted=False).indices
+        top_head_idx = top_idx_flat % Hk
+        head_budgets = torch.bincount(top_head_idx, minlength=Hk).to(torch.int32)
+        head_budgets_by_batch.append(head_budgets)
+
+        # Stage 1：与 kvpress 相同 — 先 topk(..., M1, sorted=True)，再每头取前 phase1 个下标
+        head_selection_budget_1st = (
+            (head_budgets.to(torch.float32) * float(first_stage_ratio))
+            .to(torch.int64)
+            .tolist()
+        )
+        M1 = max(head_selection_budget_1st) if head_selection_budget_1st else 0
+        if M1 > 0:
+            mk = min(M1, k_len)
+            for hk in range(Hk):
+                phase1_budget = int(head_selection_budget_1st[hk])
+                if phase1_budget <= 0:
+                    continue
+                full_idx = torch.topk(scores_seg[:, hk], mk, sorted=True).indices
+                take = min(phase1_budget, mk)
+                stage1_mask[k_beg + full_idx[:take], hk] = 1
+
+    final_scores = torch.empty((N_k, Hk), dtype=torch.float32, device=device)
+
+    def grid_fuse(_META):
+        return (B, Hk)
+
+    _critical_ada_fuse_kernel[grid_fuse](
+        base_scores,
+        wo_v_norm,
+        stage1_mask,
+        cu_seqlens,
+        final_scores,
+        *base_scores.stride(),
+        *wo_v_norm.stride(),
+        *stage1_mask.stride(),
+        *final_scores.stride(),
+        Hk=Hk,
+        EPSILON=float(epsilon),
+    )
+
+    # Stage 2（kvpress）：对融合后分数先 topk(..., M2, sorted=True)，再每头取前 budget 个下标置 inf
+    for b in range(B):
+        hb = head_budgets_by_batch[b]
+        if hb is None:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        k_len = k_end - k_beg
+        if k_len <= 0:
+            continue
+        fused_seg = final_scores[k_beg:k_end, :]
+        M2 = int(hb.max().item())
+        if M2 <= 0:
+            continue
+        mk = min(M2, k_len)
+        for hk in range(Hk):
+            budget = int(hb[hk].item())
+            if budget <= 0:
+                continue
+            full_idx = torch.topk(fused_seg[:, hk], mk, sorted=True).indices
+            take = min(budget, mk)
+            final_scores[k_beg + full_idx[:take], hk] = float("inf")
+
+    masked_key_indices = None
+    for b in range(B):
+        k_len = int(k_lengths[b].item())
+        if k_len == 0:
+            continue
+        keep_pairs = int(btr[b].item())
+        total_pairs = k_len * Hk
+        if keep_pairs >= total_pairs:
+            continue
+        k_beg = int(cu_seqlens[b].item())
+        k_end = int(cu_seqlens[b + 1].item())
+        n_prune_pairs = min(total_pairs - keep_pairs, total_pairs)
+        if n_prune_pairs <= 0:
+            continue
+
+        flat_scores = final_scores[k_beg:k_end, :].reshape(-1)
+        prune_idx = torch.topk(
+            -flat_scores, min(n_prune_pairs, flat_scores.numel()), sorted=False
+        ).indices
+        batch_idx = torch.full_like(prune_idx, b, dtype=torch.int64)
+        head_idx = prune_idx % Hk
+        seq_idx = prune_idx // Hk + k_beg
+        if masked_key_indices is None:
+            masked_key_indices = (batch_idx, head_idx, seq_idx)
+        else:
+            masked_key_indices = (
+                torch.cat([masked_key_indices[0], batch_idx]),
+                torch.cat([masked_key_indices[1], head_idx]),
+                torch.cat([masked_key_indices[2], seq_idx]),
+            )
+
+    if store_stream is not None:
+        final_scores.record_stream(store_stream)
+
+    return final_scores, masked_key_indices
+
+
+class CriticalAdaKVCompression(BaseCompressionMethod):
+    """
+    以 CompactorCompression 为基分（pre RoPE 杠杆 + post RoPE 非因果融合），
+    再应用 CriticalAda 两阶段加权；须由 Attention 在 post-RoPE 前注入 ``compression_context.wo_weight``。
+    """
+
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        cc = context.compression_context
+        base = getattr(cc, "critical_ada_base_scorer", "snapkv") if cc is not None else "compactor"
+        if str(base).lower() == "snapkv":
+            return SnapKVCompression.pre_rope_scoring(q, k, v, context)
+        return CompactorCompression.pre_rope_scoring(q, k, v, context)
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: Optional[torch.Tensor],
+        context,
+    ) -> Optional[torch.Tensor]:
+        compression_context = context.compression_context
+        assert compression_context is not None
+        base = str(getattr(compression_context, "critical_ada_base_scorer", "compactor")).lower()
+
+        if base == "snapkv":
+            base_scores = SnapKVCompression.post_rope_scoring(q, k, v, pre_rope_scores, context)
+        else:
+            # 与 compactor.py 中 CompactorCompression.post_rope_scoring 逐字一致：
+            # maybe_execute_in_stream(non_causal_attn_scores, q,k,v, cu_seqlens_q, max_seqlen_q, ...)
+            # 不得改为其它封装，否则与单独使用 COMPACTOR 时分数字不一致。
+            if context.STORE_STREAM is not None:
+                torch.cuda.current_stream().wait_stream(context.STORE_STREAM)
+
+            base_scores = maybe_execute_in_stream(
+                non_causal_attn_scores,
+                q,
+                k,
+                v,
+                context.cu_seqlens_q,
+                context.max_seqlen_q,
+                chunk_size=CompactorCompression.chunk_size,
+                sm_scale=1.0,
+                normalize=True,
+                accum_scores=pre_rope_scores,
+                context_lens=compression_context.context_lens,
+                protected_first_tokens=compression_context.protected_first_tokens,
+                protected_last_tokens=compression_context.protected_last_tokens,
+                accum_blending=0.5,
+            )
+
+        wo_weight = compression_context.wo_weight
+        if wo_weight is None:
+            return base_scores
+
+        scores, _masked = maybe_execute_in_stream(
+            critical_ada_key_scores,
+            q,
+            k,
+            v,
+            wo_weight,
+            context.cu_seqlens_q,
+            base_scores,
+            compression_context,
+            STORE_STREAM=context.STORE_STREAM,
+            store_stream=context.STORE_STREAM,
+        )
+        return scores
+
+    @staticmethod
+    def prepare_layer(module: torch.nn.Module, device: torch.device, dtype: torch.dtype):
+        """可选：预计算并缓存 Wo；实际推理以 Attention.forward 中注入的 ``cc.wo_weight`` 为准。"""
+        if not hasattr(module, "o_proj") or module.o_proj.weight is None:
+            return
+        if not hasattr(module, "num_heads") or not hasattr(module, "head_dim"):
+            return
+        wo_raw = module.o_proj.weight.data
+        hidden_size, _ = wo_raw.shape
+        Hq = module.num_heads
+        head_dim = module.head_dim
+        wo = (
+            wo_raw.transpose(0, 1)
+            .view(Hq, head_dim, hidden_size)
+            .to(device=device, dtype=torch.float32)
+        )
+        module._critical_ada_wo_weight = wo
+
--- a/vllm/kvprune/compression/snapkv.py
+++ b/vllm/kvprune/compression/snapkv.py
+import math
+from typing import Optional
+
+import torch
+import triton
+from triton import language as tl
+
+from vllm.kvprune.compression.common import BaseCompressionMethod
+from vllm.kvprune.utils.helpers import maybe_execute_in_stream
+from vllm.kvprune.utils.triton_compat import autotune as triton_autotune
+
+# SnapKV defaults aligned with kvpress `SnapKVPress` (snapkv_press.py).
+# Window size: passed as `w` to `query_aware_key_scores`.
+DEFAULT_SNAPKV_WINDOW_SIZE = 64
+# Pooling width: passed as `kernel_size` to `query_aware_key_scores`.
+DEFAULT_SNAPKV_KERNEL_SIZE = 5
+
+
+class SnapKVCompression(BaseCompressionMethod):
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        return None
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: torch.Tensor,
+        context,
+    ) -> Optional[torch.Tensor]:
+        scores = maybe_execute_in_stream(
+            query_aware_key_scores,
+            q,
+            k,
+            context.cu_seqlens_q,
+            context.cu_seqlens_k,
+            w=DEFAULT_SNAPKV_WINDOW_SIZE,
+            kernel_size=DEFAULT_SNAPKV_KERNEL_SIZE,
+            STORE_STREAM=context.STORE_STREAM,
+        )
+        return scores
+
+
+# Pass 1 of SnapKV scoring. Program (b, hk, rid) handles one BLOCK_Q tile of
+# "rows" for sequence b and KV head hk. A row is one (window query, query head
+# within the GQA group) pair: row = q_off * QUERY_GROUP_SIZE + hq_local, where
+# the window is the last `win = w_b[b]` queries of the sequence.
+# For each row the kernel
+#   * streams over all keys [k_beg, k_end) under a causal mask while keeping a
+#     running base-2 maximum `m` and rescaled sum `S` (online log-sum-exp), and
+#   * stores the masked, scaled base-2 logits of prefix keys only
+#     (nk < k_end - win) into LOGITS[nk, hk, row].
+# `m` and `S` are written to out_m / out_S at [b, hk, row].
+# NOTE(review): the causal test compares packed key indices with packed query
+# indices directly, so it presumably relies on cu_q and cu_k describing the
+# same offsets (plain prefill) - confirm before using with differing layouts.
+@triton_autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_Q": bq, "BLOCK_K": bk}, num_warps=num_warps, num_stages=num_stages
+        )
+        for bq in [32, 64]
+        for bk in [32, 64]
+        for num_warps in [4, 8]
+        for num_stages in [3, 4]
+    ],
+    key=["QUERY_GROUP_SIZE", "D", "ROWS_MAX"],
+    cache_results=True,
+)
+@triton.jit
+def _lse_and_store_logits_kernel(
+    Q,
+    K,
+    cu_q,
+    cu_k,
+    w_b,  # int32 pointers
+    out_m,
+    out_S,  # [B, Hk, ROWS_MAX] float32
+    LOGITS,  # [Nk, Hk, ROWS_MAX] float32
+    sm_scale,  # float
+    QUERY_GROUP_SIZE: tl.constexpr,
+    D: tl.constexpr,
+    STRIDE_Q_NQ,
+    STRIDE_Q_HQ,
+    STRIDE_K_NK,
+    STRIDE_K_HK,
+    STRIDE_M_B,
+    STRIDE_M_H,
+    STRIDE_M_R,
+    STRIDE_S_B,
+    STRIDE_S_H,
+    STRIDE_S_R,
+    STRIDE_LG_NK,
+    STRIDE_LG_HK,
+    STRIDE_LG_R,
+    BLOCK_Q: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    ROWS_MAX,
+):
+    # program ids
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+    rid = tl.program_id(2)  # row-tile id
+    # batch segment bounds
+    q_end = tl.load(cu_q + b + 1)
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    win = tl.load(w_b + b)
+
+    # First window query and one-past-the-last prefix key.
+    q_win_beg = q_end - win
+    k_eff_end = k_end - win
+    # Nothing to do without a window or without any prefix key.
+    if (win <= 0) or (k_eff_end <= k_beg):
+        return
+
+    # rows for this (b,hk)
+    rows_b = win * QUERY_GROUP_SIZE
+    row0 = rid * BLOCK_Q
+    # The grid is sized for ROWS_MAX; tiles beyond this sequence's rows exit.
+    if row0 >= rows_b:
+        return
+
+    # exp(x) = exp2(x * 1/ln2)
+    qk_scale = sm_scale * 1.4426950408889634
+
+    offs_qrow = row0 + tl.arange(0, BLOCK_Q)
+    row_mask = offs_qrow < rows_b
+
+    # map row -> (q_idx, hq_local)
+    hq_local = offs_qrow % QUERY_GROUP_SIZE
+    q_off = offs_qrow // QUERY_GROUP_SIZE
+    q_idx = q_win_beg + q_off
+    hq_glob = hk * QUERY_GROUP_SIZE + hq_local
+
+    offs_d = tl.arange(0, D)
+
+    # The last dimension of Q is addressed with unit stride.
+    q_ptrs = (
+        Q
+        + q_idx[:, None] * STRIDE_Q_NQ
+        + hq_glob[:, None] * STRIDE_Q_HQ
+        + offs_d[None, :]
+    )
+    q_rows = tl.load(q_ptrs, mask=row_mask[:, None], other=0.0)
+    # Running maximum and running sum of the online log-sum-exp.
+    m = tl.zeros([BLOCK_Q], dtype=tl.float32) + (-float("inf"))
+    S = tl.zeros([BLOCK_Q], dtype=tl.float32)
+
+    # Full-sequence causal attention (matches kvpress softmax), then use prefix columns only.
+    for ks in tl.range(k_beg, k_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_end
+
+        k_ptrs = K + nk[:, None] * STRIDE_K_NK + hk * STRIDE_K_HK + offs_d[None, :]
+        k_blk = tl.load(k_ptrs, mask=kmask[:, None], other=0.0)  # [BK, D]
+
+        s = tl.dot(q_rows, k_blk.T) * qk_scale  # [BQ, BK]
+        s = tl.where(kmask[None, :], s, -float("inf"))
+        # Causal: key j only if j <= q_idx (same as kvpress triu mask on the window×k_len grid).
+        causal_ok = nk[None, :] <= q_idx[:, None]
+        s = tl.where(causal_ok, s, -float("inf"))
+
+        # store prefix logits only (for marginal probs on prefix keys)
+        log_ptrs = (
+            LOGITS
+            + nk[:, None] * STRIDE_LG_NK
+            + hk * STRIDE_LG_HK
+            + (row0 + tl.arange(0, BLOCK_Q))[None, :] * STRIDE_LG_R
+        )
+        store_mask = kmask & (nk < k_eff_end)
+        tl.store(log_ptrs, s.T, mask=store_mask[:, None] & row_mask[None, :])
+
+        # log2 streaming LSE over all keys in [k_beg, k_end) (after causal mask)
+        cur_max = tl.max(s, 1)  # [BQ]
+        n_m = tl.maximum(m, cur_max)
+        # Rescale the sum accumulated so far to the new maximum.
+        rescale = tl.math.exp2(m - n_m)
+        S = S * rescale + tl.sum(tl.math.exp2(s - n_m[:, None]), 1)
+        m = n_m
+
+    # store m,S for these rows
+    m_base = out_m + b * STRIDE_M_B + hk * STRIDE_M_H + row0 * STRIDE_M_R
+    S_base = out_S + b * STRIDE_S_B + hk * STRIDE_S_H + row0 * STRIDE_S_R
+    tl.store(m_base + tl.arange(0, BLOCK_Q) * STRIDE_M_R, m, mask=row_mask)
+    tl.store(S_base + tl.arange(0, BLOCK_Q) * STRIDE_S_R, S, mask=row_mask)
+
+
+@triton_autotune(
+    configs=[
+        triton.Config({"BLOCK_Q": bq, "BLOCK_K": bk})
+        for bq in [16, 32, 64]
+        for bk in [32, 64, 128]
+    ],
+    key=["HK", "HQ"],
+    cache_results=True,
+)
+@triton.jit
+def _prefix_probs_kernel(
+    cu_k,
+    w_b,
+    in_m,
+    in_S,  # [B, Hk, ROWS_MAX] f32
+    LOGITS,  # [Nk, Hk, ROWS_MAX] f32, base-2 logits (prefix keys only)
+    PROBS,  # [Nk, Hk, ROWS_MAX] f32 — per-row prefix marginal probs
+    #
+    QUERY_GROUP_SIZE: tl.constexpr,
+    STRIDE_M_B,
+    STRIDE_M_H,
+    STRIDE_M_R,
+    STRIDE_S_B,
+    STRIDE_S_H,
+    STRIDE_S_R,
+    STRIDE_LG_NK,
+    STRIDE_LG_HK,
+    STRIDE_LG_R,
+    STRIDE_PB_NK,
+    STRIDE_PB_HK,
+    STRIDE_PB_R,
+    BLOCK_Q: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    win = tl.load(w_b + b)
+
+    k_eff_end = k_end - win
+    if (win <= 0) or (k_eff_end <= k_beg):
+        return
+
+    rows_b = win * QUERY_GROUP_SIZE
+
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+
+        for row0 in tl.range(0, rows_b, BLOCK_Q):
+            r_idx = row0 + tl.arange(0, BLOCK_Q)
+            rmask = r_idx < rows_b
+
+            m_ptr = in_m + b * STRIDE_M_B + hk * STRIDE_M_H + row0 * STRIDE_M_R
+            S_ptr = in_S + b * STRIDE_S_B + hk * STRIDE_S_H + row0 * STRIDE_S_R
+            m = tl.load(
+                m_ptr + tl.arange(0, BLOCK_Q) * STRIDE_M_R,
+                mask=rmask,
+                other=-float("inf"),
+            )
+            S = tl.load(
+                S_ptr + tl.arange(0, BLOCK_Q) * STRIDE_S_R, mask=rmask, other=0.0
+            )
+
+            valid_row = S > 0
+            m = tl.where(valid_row, m, 0.0)
+            S = tl.where(valid_row, S, 1.0)
+
+            log_ptrs = (
+                LOGITS
+                + nk[:, None] * STRIDE_LG_NK
+                + hk * STRIDE_LG_HK
+                + (row0 + tl.arange(0, BLOCK_Q))[None, :] * STRIDE_LG_R
+            )
+            s_T = tl.load(
+                log_ptrs, mask=kmask[:, None] & rmask[None, :], other=-float("inf")
+            )  # [BK, BQ]
+
+            probs_T = tl.math.exp2(s_T - m[None, :]) / S[None, :]
+            probs_T = tl.where(valid_row[None, :], probs_T, 0.0)
+
+            prob_ptrs = (
+                PROBS
+                + nk[:, None] * STRIDE_PB_NK
+                + hk * STRIDE_PB_HK
+                + (row0 + tl.arange(0, BLOCK_Q))[None, :] * STRIDE_PB_R
+            )
+            tl.store(prob_ptrs, probs_T, mask=kmask[:, None] & rmask[None, :])
+
+
+# In-place z-score normalisation, one program per sequence b.
+# Pass 1 accumulates the sum and sum of squares of OUT over the prefix rows
+# [k_beg, k_end - win) and all HK heads; pass 2 rewrites those same entries as
+# (value - mean) / sqrt(var + EPS). Rows in the trailing window are untouched.
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_K": bk}) for bk in [32, 64, 128]],
+    key=["HK"],
+    cache_results=True,
+)
+@triton.jit
+def _zscore_per_batch_epilogue(
+    OUT,  # [Nk, Hk], float32
+    cu_k,
+    w_b,  # [B+1], [B] int32
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    HK: tl.constexpr,  # Hk
+    EPS: tl.constexpr,  # e.g., 1e-12
+    BLOCK_K: tl.constexpr,  # e.g., 128
+):
+    b = tl.program_id(0)
+
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    win = tl.load(w_b + b)
+
+    # One past the last prefix row; skip sequences with an empty prefix.
+    k_eff_end = k_end - win
+    if k_eff_end <= k_beg:
+        return
+
+    sumv = tl.zeros([], dtype=tl.float32)
+    sumsq = tl.zeros([], dtype=tl.float32)
+    # Number of entries the statistics are taken over (prefix rows x heads).
+    count = ((k_eff_end - k_beg) * HK).to(tl.float32)
+
+    # Pass 1: accumulate sum and sum of squares.
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+        for h in tl.range(0, HK):
+            ptrs = OUT + nk * STRIDE_OUT_NK + h * STRIDE_OUT_HK
+            vals = tl.load(ptrs, mask=kmask, other=0.0).to(tl.float32)
+            sumv += tl.sum(vals, 0)
+            sumsq += tl.sum(vals * vals, 0)
+
+    mean = sumv / count
+    # E[x^2] - E[x]^2, clamped at zero against rounding error.
+    var = tl.maximum(sumsq / count - mean * mean, 0.0)
+    invstd = 1.0 / tl.sqrt(var + EPS)
+
+    # Pass 2: normalise the same entries in place.
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+        for h in tl.range(0, HK):
+            ptrs = OUT + nk * STRIDE_OUT_NK + h * STRIDE_OUT_HK
+            vals = tl.load(ptrs, mask=kmask, other=0.0).to(tl.float32)
+            vals = (vals - mean) * invstd
+            tl.store(ptrs, vals, mask=kmask)
+
+
+# Program (c, tile) produces BLOCK_T consecutive outputs of channel c; the
+# output has the same length Lp as the input.
+# NOTE(review): with an even KERNEL_SIZE, F.avg_pool1d(padding=K // 2) would
+# presumably yield Lp + 1 outputs, so the equivalence stated in the docstring
+# looks intended for odd kernel sizes - confirm.
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_T": bt}) for bt in [32, 64, 128, 256]],
+    key=["KERNEL_SIZE"],
+    cache_results=True,
+)
+@triton.jit
+def _snapkv_avg_pool1d_kernel(
+    IN,
+    OUT,
+    Lp,
+    STRIDE_IN_C,
+    STRIDE_IN_L,
+    STRIDE_OUT_C,
+    STRIDE_OUT_L,
+    KERNEL_SIZE: tl.constexpr,
+    PAD: tl.constexpr,
+    BLOCK_T: tl.constexpr,
+):
+    """
+    Symmetric 1D average pool on the last dimension, matching
+    `F.avg_pool1d(x, kernel_size=K, padding=K//2, stride=1)` on `x` shaped [C, Lp]
+    (equivalent to PyTorch [C, 1, Lp] avg_pool1d with divisor = kernel size).
+    """
+    c = tl.program_id(0)
+    t0 = tl.program_id(1) * BLOCK_T + tl.arange(0, BLOCK_T)
+    mask = t0 < Lp
+
+    acc = tl.zeros([BLOCK_T], dtype=tl.float32)
+    # Unrolled over the taps of the window centred (for odd sizes) on t0.
+    for j in tl.static_range(KERNEL_SIZE):
+        idx = t0 - PAD + j
+        # Taps outside [0, Lp) contribute zero, i.e. zero padding.
+        valid = (idx >= 0) & (idx < Lp)
+        ptrs = IN + c * STRIDE_IN_C + idx * STRIDE_IN_L
+        v = tl.load(ptrs, mask=valid & mask, other=0.0).to(tl.float32)
+        acc += v
+    # Always divide by the full kernel size, padded taps included.
+    acc = acc / tl.cast(KERNEL_SIZE, tl.float32)
+
+    out_ptrs = OUT + c * STRIDE_OUT_C + t0 * STRIDE_OUT_L
+    tl.store(out_ptrs, acc, mask=mask)
+
+
+def _snapkv_avg_pool1d_triton(x: torch.Tensor, kernel_size: int) -> torch.Tensor:
+    """
+    kvpress-equivalent smoothing: same as `F.avg_pool1d` on [Hk*G, 1, Lp].
+    `x` must be float32 and contiguous along Lp (shape [Hk, G, Lp]).
+    """
+    assert x.dtype == torch.float32
+    Hk, G, Lp = x.shape
+    if Lp == 0:
+        return x
+    pad = kernel_size // 2
+    x2 = x.reshape(Hk * G, Lp).contiguous()
+    out = torch.empty_like(x2)
+    C = Hk * G
+    si_c, si_l = x2.stride()
+    so_c, so_l = out.stride()
+
+    def grid(meta):
+        return (C, triton.cdiv(Lp, meta["BLOCK_T"]))
+
+    _snapkv_avg_pool1d_kernel[grid](
+        x2,
+        out,
+        Lp,
+        si_c,
+        si_l,
+        so_c,
+        so_l,
+        KERNEL_SIZE=kernel_size,
+        PAD=pad,
+    )
+    return out.view(Hk, G, Lp)
+
+
+def _snapkv_kvpress_epilogue(
+    probs_buf: torch.Tensor,
+    out: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    w: torch.Tensor,
+    G: int,
+    Hk: int,
+    kernel_size: int,
+) -> None:
+    """
+    Match kvpress SnapKV order: mean over window queries → symmetric avg_pool1d
+    → mean over GQA groups → pad tail with global max of prefix scores.
+    """
+    B = cu_seqlens_k.numel() - 1
+    for b in range(B):
+        k_beg = int(cu_seqlens_k[b].item())
+        k_end = int(cu_seqlens_k[b + 1].item())
+        win = int(w[b].item())
+        k_eff_end = k_end - win
+        if win <= 0 or k_eff_end <= k_beg:
+            continue
+        Lp = k_eff_end - k_beg
+        rows_b = win * G
+        p = probs_buf[k_beg:k_eff_end, :, :rows_b]
+        # [Lp, Hk, win, G] — rows are (q_off, g) order per Triton row layout
+        x = p.view(Lp, Hk, win, G).mean(dim=2)
+        x = x.permute(1, 2, 0).contiguous()  # [Hk, G, Lp]
+        x = _snapkv_avg_pool1d_triton(x, kernel_size)
+        x = x.mean(dim=1)
+        seg = x.permute(1, 0).contiguous()
+        out[k_beg:k_eff_end, :] = seg
+        pad_val = seg.max()
+        out[k_eff_end:k_end, :] = pad_val
+
+
+def query_aware_key_scores(
+    q: torch.Tensor,  # [N_q, Hq, D]
+    k: torch.Tensor,  # [N_k, Hk, D]
+    cu_seqlens_q: torch.Tensor,  # [B+1], int32
+    cu_seqlens_k: torch.Tensor,  # [B+1], int32
+    w: torch.Tensor | int,  # [B], int32
+    sm_scale: float = None,  # defaults to 1/sqrt(D)
+    *,
+    kernel_size: int = DEFAULT_SNAPKV_KERNEL_SIZE,
+    accum_scores: torch.Tensor = None,
+    accum_blending: float = None,
+    normalize: bool = False,
+) -> Optional[torch.Tensor]:
+    assert q.stride(-1) == 1 and k.stride(-1) == 1, "last dim must be contiguous"
+    device = q.device
+    N_q, Hq, D = q.shape
+    N_k, Hk, Dk = k.shape
+    assert (Hq % Hk) == 0, "Hq must be a multiple of Hk"
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(D)
+
+    B = cu_seqlens_q.numel() - 1
+    assert B == cu_seqlens_k.numel() - 1
+
+    G = Hq // Hk
+    if type(w) is int:
+        max_w = w
+        w = torch.full((B,), fill_value=w, device=device, dtype=torch.int32)
+    else:
+        max_w = int(w.max().item())
+        assert w.numel() == B
+    ROWS_MAX = max_w * G
+    if ROWS_MAX == 0:
+        return torch.zeros((N_k, Hk), dtype=torch.float32, device=device)
+
+    out = torch.zeros((N_k, Hk), dtype=torch.float32, device=device)
+    m_scratch = torch.empty((B, Hk, ROWS_MAX), dtype=torch.float32, device=device)
+    S_scratch = torch.empty((B, Hk, ROWS_MAX), dtype=torch.float32, device=device)
+    logits_buf = torch.empty((N_k, Hk, ROWS_MAX), dtype=torch.float32, device=device)
+    probs_buf = torch.empty((N_k, Hk, ROWS_MAX), dtype=torch.float32, device=device)
+
+    # strides
+    STRIDE_Q_NQ, STRIDE_Q_HQ, _ = q.stride()
+    STRIDE_K_NK, STRIDE_K_HK, _ = k.stride()
+    STRIDE_M_B, STRIDE_M_H, STRIDE_M_R = m_scratch.stride()
+    STRIDE_S_B, STRIDE_S_H, STRIDE_S_R = S_scratch.stride()
+    STRIDE_LG_NK, STRIDE_LG_HK, STRIDE_LG_R = logits_buf.stride()
+    STRIDE_PB_NK, STRIDE_PB_HK, STRIDE_PB_R = probs_buf.stride()
+    STRIDE_OUT_NK, STRIDE_OUT_HK = out.stride()
+
+    def grid(META):
+        return B, Hk, triton.cdiv(ROWS_MAX, META["BLOCK_Q"])
+
+    _lse_and_store_logits_kernel[grid](
+        q,
+        k,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        w,
+        m_scratch,
+        S_scratch,
+        logits_buf,
+        sm_scale,
+        QUERY_GROUP_SIZE=Hq // Hk,
+        D=D,
+        STRIDE_Q_NQ=STRIDE_Q_NQ,
+        STRIDE_Q_HQ=STRIDE_Q_HQ,
+        STRIDE_K_NK=STRIDE_K_NK,
+        STRIDE_K_HK=STRIDE_K_HK,
+        STRIDE_M_B=STRIDE_M_B,
+        STRIDE_M_H=STRIDE_M_H,
+        STRIDE_M_R=STRIDE_M_R,
+        STRIDE_S_B=STRIDE_S_B,
+        STRIDE_S_H=STRIDE_S_H,
+        STRIDE_S_R=STRIDE_S_R,
+        STRIDE_LG_NK=STRIDE_LG_NK,
+        STRIDE_LG_HK=STRIDE_LG_HK,
+        STRIDE_LG_R=STRIDE_LG_R,
+        ROWS_MAX=ROWS_MAX,
+    )
+
+    _prefix_probs_kernel[(B, Hk)](
+        cu_seqlens_k,
+        w,
+        m_scratch,
+        S_scratch,
+        logits_buf,
+        probs_buf,
+        QUERY_GROUP_SIZE=Hq // Hk,
+        STRIDE_M_B=STRIDE_M_B,
+        STRIDE_M_H=STRIDE_M_H,
+        STRIDE_M_R=STRIDE_M_R,
+        STRIDE_S_B=STRIDE_S_B,
+        STRIDE_S_H=STRIDE_S_H,
+        STRIDE_S_R=STRIDE_S_R,
+        STRIDE_LG_NK=STRIDE_LG_NK,
+        STRIDE_LG_HK=STRIDE_LG_HK,
+        STRIDE_LG_R=STRIDE_LG_R,
+        STRIDE_PB_NK=STRIDE_PB_NK,
+        STRIDE_PB_HK=STRIDE_PB_HK,
+        STRIDE_PB_R=STRIDE_PB_R,
+    )
+    _snapkv_kvpress_epilogue(
+        probs_buf, out, cu_seqlens_k, w, G, Hk, kernel_size
+    )
+    if normalize:
+        _zscore_per_batch_epilogue[(B,)](
+            out,
+            cu_seqlens_k,
+            w,
+            STRIDE_OUT_NK,
+            STRIDE_OUT_HK,
+            HK=Hk,
+            EPS=1e-12,
+        )
+    if accum_scores is not None:
+        if accum_blending is not None:
+            accum_scores.mul_(accum_blending)
+        accum_scores.add_(out)
+        return accum_scores
+    else:
+        return out
+
--- a/vllm/kvprune/compression/snapkv_origin.py
+++ b/vllm/kvprune/compression/snapkv_origin.py
+import math
+from typing import Optional
+
+import torch
+import triton
+from triton import language as tl
+
+from vllm.kvprune.compression.common import BaseCompressionMethod
+from vllm.kvprune.utils.helpers import maybe_execute_in_stream
+from vllm.kvprune.utils.triton_compat import autotune as triton_autotune
+
+
+class SnapKVCompression(BaseCompressionMethod):
+    @staticmethod
+    def pre_rope_scoring(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, context
+    ) -> Optional[torch.Tensor]:
+        return None
+
+    @staticmethod
+    def post_rope_scoring(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        pre_rope_scores: torch.Tensor,
+        context,
+    ) -> Optional[torch.Tensor]:
+        scores = maybe_execute_in_stream(
+            query_aware_key_scores,
+            q,
+            k,
+            context.cu_seqlens_q,
+            context.cu_seqlens_k,
+            w=32,
+            STORE_STREAM=context.STORE_STREAM,
+        )
+        return scores
+
+
+# Pass 1 of the original SnapKV scoring. Program (b, hk, rid) handles one
+# BLOCK_Q tile of "rows" for sequence b and KV head hk. A row is one
+# (window query, query head within the GQA group) pair:
+# row = q_off * QUERY_GROUP_SIZE + hq_local, where the window is the last
+# `win = w_b[b]` queries of the sequence.
+# Only the prefix keys [k_beg, k_end - win) are visited and no causal mask is
+# applied. For each row the kernel stores the scaled base-2 logits into
+# LOGITS[nk, hk, row] and keeps a running base-2 maximum `m` and rescaled sum
+# `S` (online log-sum-exp), written to out_m / out_S at [b, hk, row].
+@triton_autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_Q": bq, "BLOCK_K": bk}, num_warps=num_warps, num_stages=num_stages
+        )
+        for bq in [32, 64]
+        for bk in [32, 64]
+        for num_warps in [4, 8]
+        for num_stages in [3, 4]
+    ],
+    key=["QUERY_GROUP_SIZE", "D", "ROWS_MAX"],
+    cache_results=True,
+)
+@triton.jit
+def _lse_and_store_logits_kernel(
+    Q,
+    K,
+    cu_q,
+    cu_k,
+    w_b,  # int32 pointers
+    out_m,
+    out_S,  # [B, Hk, ROWS_MAX] float32
+    LOGITS,  # [Nk, Hk, ROWS_MAX] float32
+    sm_scale,  # float
+    QUERY_GROUP_SIZE: tl.constexpr,
+    D: tl.constexpr,
+    STRIDE_Q_NQ,
+    STRIDE_Q_HQ,
+    STRIDE_K_NK,
+    STRIDE_K_HK,
+    STRIDE_M_B,
+    STRIDE_M_H,
+    STRIDE_M_R,
+    STRIDE_S_B,
+    STRIDE_S_H,
+    STRIDE_S_R,
+    STRIDE_LG_NK,
+    STRIDE_LG_HK,
+    STRIDE_LG_R,
+    BLOCK_Q: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    ROWS_MAX,
+):
+    # program ids
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+    rid = tl.program_id(2)  # row-tile id
+    # batch segment bounds
+    q_end = tl.load(cu_q + b + 1)
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    win = tl.load(w_b + b)
+
+    # First window query and one-past-the-last prefix key.
+    q_win_beg = q_end - win
+    k_eff_end = k_end - win
+    # Nothing to do without a window or without any prefix key.
+    if (win <= 0) or (k_eff_end <= k_beg):
+        return
+
+    # rows for this (b,hk)
+    rows_b = win * QUERY_GROUP_SIZE
+    row0 = rid * BLOCK_Q
+    # The grid is sized for ROWS_MAX; tiles beyond this sequence's rows exit.
+    if row0 >= rows_b:
+        return
+
+    # exp(x) = exp2(x * 1/ln2)
+    qk_scale = sm_scale * 1.4426950408889634
+
+    offs_qrow = row0 + tl.arange(0, BLOCK_Q)
+    row_mask = offs_qrow < rows_b
+
+    # map row -> (q_idx, hq_local)
+    hq_local = offs_qrow % QUERY_GROUP_SIZE
+    q_off = offs_qrow // QUERY_GROUP_SIZE
+    q_idx = q_win_beg + q_off
+    hq_glob = hk * QUERY_GROUP_SIZE + hq_local
+
+    offs_d = tl.arange(0, D)
+
+    # The last dimension of Q is addressed with unit stride.
+    q_ptrs = (
+        Q
+        + q_idx[:, None] * STRIDE_Q_NQ
+        + hq_glob[:, None] * STRIDE_Q_HQ
+        + offs_d[None, :]
+    )
+    q_rows = tl.load(q_ptrs, mask=row_mask[:, None], other=0.0)
+    # Running maximum and running sum of the online log-sum-exp.
+    m = tl.zeros([BLOCK_Q], dtype=tl.float32) + (-float("inf"))
+    S = tl.zeros([BLOCK_Q], dtype=tl.float32)
+
+    # Visit the prefix keys only; the last `win` keys are never loaded.
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+
+        k_ptrs = K + nk[:, None] * STRIDE_K_NK + hk * STRIDE_K_HK + offs_d[None, :]
+        k_blk = tl.load(k_ptrs, mask=kmask[:, None], other=0.0)  # [BK, D]
+
+        s = tl.dot(q_rows, k_blk.T) * qk_scale  # [BQ, BK]
+        s = tl.where(kmask[None, :], s, -float("inf"))
+
+        # store into LOGITS[nk, hk, row]  -> [BK, BQ]
+        log_ptrs = (
+            LOGITS
+            + nk[:, None] * STRIDE_LG_NK
+            + hk * STRIDE_LG_HK
+            + (row0 + tl.arange(0, BLOCK_Q))[None, :] * STRIDE_LG_R
+        )
+        tl.store(log_ptrs, s.T, mask=kmask[:, None] & row_mask[None, :])
+
+        # log2 streaming LSE update
+        cur_max = tl.max(s, 1)  # [BQ]
+        n_m = tl.maximum(m, cur_max)
+        # Rescale the sum accumulated so far to the new maximum.
+        rescale = tl.math.exp2(m - n_m)
+        S = S * rescale + tl.sum(tl.math.exp2(s - n_m[:, None]), 1)
+        m = n_m
+
+    # store m,S for these rows
+    m_base = out_m + b * STRIDE_M_B + hk * STRIDE_M_H + row0 * STRIDE_M_R
+    S_base = out_S + b * STRIDE_S_B + hk * STRIDE_S_H + row0 * STRIDE_S_R
+    tl.store(m_base + tl.arange(0, BLOCK_Q) * STRIDE_M_R, m, mask=row_mask)
+    tl.store(S_base + tl.arange(0, BLOCK_Q) * STRIDE_S_R, S, mask=row_mask)
+
+
+# Second pass of the query-aware scoring pipeline.
+# Launched on a (B, Hk) grid: each program handles one (batch, kv-head) pair and
+# converts the base-2 logits plus per-row (m, S) softmax statistics written by
+# the previous kernel into one score per key, stored in OUT[nk, hk].
+@triton_autotune(
+    configs=[
+        triton.Config({"BLOCK_Q": bq, "BLOCK_K": bk})
+        for bq in [16, 32, 64]
+        for bk in [32, 64, 128]
+    ],
+    # NOTE(review): "HK" and "HQ" are not parameters of this kernel — confirm the
+    # autotune wrapper tolerates key names that are absent from the signature.
+    key=["HK", "HQ"],
+    cache_results=True,
+)
+@triton.jit
+def _scores_from_logits_kernel(
+    cu_k,
+    w_b,
+    in_m,
+    in_S,  # [B, Hk, ROWS_MAX] f32
+    LOGITS,  # [Nk, Hk, ROWS_MAX] f32, base-2 logits
+    OUT,  # [Nk, Hk] f32
+    #
+    QUERY_GROUP_SIZE: tl.constexpr,
+    STRIDE_M_B,
+    STRIDE_M_H,
+    STRIDE_M_R,
+    STRIDE_S_B,
+    STRIDE_S_H,
+    STRIDE_S_R,
+    STRIDE_LG_NK,
+    STRIDE_LG_HK,
+    STRIDE_LG_R,
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    BLOCK_Q: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    #
+    DO_POOL: tl.constexpr,  # set True to enable in-place avg pool
+    KPOOL: tl.constexpr,  # kernel size for avg pool (stride=1)
+):
+    b = tl.program_id(0)
+    hk = tl.program_id(1)
+
+    # Key range of this batch element and its per-batch window length.
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    win = tl.load(w_b + b)
+
+    # Only keys in [k_beg, k_eff_end) are scored; the last `win` keys are
+    # handled by the +inf fill at the bottom of the kernel.
+    k_eff_end = k_end - win
+    # Early exit writes nothing to OUT for this (b, hk): the caller's buffer
+    # keeps whatever it held before the launch.
+    if (win <= 0) or (k_eff_end <= k_beg):
+        return
+
+    # Number of stored query rows for this (b, hk): window tokens x group size.
+    rows_b = win * QUERY_GROUP_SIZE
+
+    # === scores over computed region ===
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+
+        scores = tl.zeros([BLOCK_K], dtype=tl.float32)
+
+        for row0 in tl.range(0, rows_b, BLOCK_Q):
+            r_idx = row0 + tl.arange(0, BLOCK_Q)
+            rmask = r_idx < rows_b
+
+            # load m, S for rows
+            m_ptr = in_m + b * STRIDE_M_B + hk * STRIDE_M_H + row0 * STRIDE_M_R
+            S_ptr = in_S + b * STRIDE_S_B + hk * STRIDE_S_H + row0 * STRIDE_S_R
+            m = tl.load(
+                m_ptr + tl.arange(0, BLOCK_Q) * STRIDE_M_R,
+                mask=rmask,
+                other=-float("inf"),
+            )
+            S = tl.load(
+                S_ptr + tl.arange(0, BLOCK_Q) * STRIDE_S_R, mask=rmask, other=0.0
+            )
+
+            # Rows with S <= 0 (including masked-out rows, loaded as S = 0) are
+            # given neutral m/S so the division below cannot produce NaN/inf;
+            # their contribution is zeroed again after the exp2.
+            valid_row = S > 0
+            m = tl.where(valid_row, m, 0.0)
+            S = tl.where(valid_row, S, 1.0)
+
+            # load stored logits^T: [BK, BQ]
+            log_ptrs = (
+                LOGITS
+                + nk[:, None] * STRIDE_LG_NK
+                + hk * STRIDE_LG_HK
+                + (row0 + tl.arange(0, BLOCK_Q))[None, :] * STRIDE_LG_R
+            )
+            s_T = tl.load(
+                log_ptrs, mask=kmask[:, None] & rmask[None, :], other=-float("inf")
+            )  # [BK, BQ]
+
+            # probs^T = exp2(s_T - m) / S, sum over rows
+            probs_T = tl.math.exp2(s_T - m[None, :]) / S[None, :]
+            probs_T = tl.where(valid_row[None, :], probs_T, 0.0)
+
+            scores += tl.sum(probs_T, 1)  # [BK]
+
+        if DO_POOL and (KPOOL > 1):
+            # Trailing average over the current key and up to KPOOL-1 preceding
+            # keys. The band is built from in-tile indices only, so it never
+            # reaches into the previous BLOCK_K tile: the first entries of each
+            # tile are averaged over fewer than KPOOL values.
+            i = tl.arange(0, BLOCK_K)[:, None]
+            j = tl.arange(0, BLOCK_K)[None, :]
+            band = (j <= i) & ((i - j) < KPOOL)
+            band = band & kmask[None, :]
+            # sum within band
+            sums = tl.sum(tl.where(band, scores[None, :], 0.0), 1)  # [BK]
+            denom = tl.sum(band, 1).to(tl.float32)  # [BK]
+            denom = tl.where(denom > 0, denom, 1.0)
+            scores = sums / denom
+
+        out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+        tl.store(out_ptrs, scores, mask=kmask)
+
+    # The last `win` key positions [k_eff_end, k_end) receive +inf as score.
+    pad_beg = k_eff_end
+    pad_end = k_end
+    if pad_end > pad_beg:
+        for ks in tl.range(pad_beg, pad_end, BLOCK_K):
+            nk = ks + tl.arange(0, BLOCK_K)
+            kmask = nk < pad_end
+            out_ptrs = OUT + nk * STRIDE_OUT_NK + hk * STRIDE_OUT_HK
+            tl.store(
+                out_ptrs, tl.full([BLOCK_K], float("inf"), dtype=tl.float32), mask=kmask
+            )
+
+
+# In-place z-score normalisation of OUT, one program per batch element.
+# Mean and variance are taken jointly over every kv-head and every key in
+# [k_beg, k_end - win); entries in the trailing window are left untouched.
+@triton_autotune(
+    configs=[triton.Config({"BLOCK_K": bk}) for bk in [32, 64, 128]],
+    key=["HK"],
+    cache_results=True,
+)
+@triton.jit
+def _zscore_per_batch_epilogue(
+    OUT,  # [Nk, Hk], float32
+    cu_k,
+    w_b,  # [B+1], [B] int32
+    STRIDE_OUT_NK,
+    STRIDE_OUT_HK,
+    HK: tl.constexpr,  # Hk
+    EPS: tl.constexpr,  # e.g., 1e-12
+    BLOCK_K: tl.constexpr,  # e.g., 128
+):
+    b = tl.program_id(0)
+
+    k_beg = tl.load(cu_k + b)
+    k_end = tl.load(cu_k + b + 1)
+    win = tl.load(w_b + b)
+
+    # Nothing to normalise when the window covers the whole sequence.
+    # Note: there is no `win <= 0` check here, so a batch with win == 0 is
+    # normalised over its full key range.
+    k_eff_end = k_end - win
+    if k_eff_end <= k_beg:
+        return
+
+    # Pass 1: accumulate sum and sum of squares over all (key, head) entries.
+    sumv = tl.zeros([], dtype=tl.float32)
+    sumsq = tl.zeros([], dtype=tl.float32)
+    count = ((k_eff_end - k_beg) * HK).to(tl.float32)
+
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+        for h in tl.range(0, HK):
+            ptrs = OUT + nk * STRIDE_OUT_NK + h * STRIDE_OUT_HK
+            vals = tl.load(ptrs, mask=kmask, other=0.0).to(tl.float32)
+            sumv += tl.sum(vals, 0)
+            sumsq += tl.sum(vals * vals, 0)
+
+    # Population variance via E[x^2] - E[x]^2, clamped at 0 against rounding;
+    # EPS keeps the inverse std finite when the variance is zero.
+    mean = sumv / count
+    var = tl.maximum(sumsq / count - mean * mean, 0.0)
+    invstd = 1.0 / tl.sqrt(var + EPS)
+
+    # Pass 2: rewrite every entry as (x - mean) / std.
+    for ks in tl.range(k_beg, k_eff_end, BLOCK_K):
+        nk = ks + tl.arange(0, BLOCK_K)
+        kmask = nk < k_eff_end
+        for h in tl.range(0, HK):
+            ptrs = OUT + nk * STRIDE_OUT_NK + h * STRIDE_OUT_HK
+            vals = tl.load(ptrs, mask=kmask, other=0.0).to(tl.float32)
+            vals = (vals - mean) * invstd
+            tl.store(ptrs, vals, mask=kmask)
+
+
+def query_aware_key_scores(
+    q: torch.Tensor,  # [N_q, Hq, D]
+    k: torch.Tensor,  # [N_k, Hk, D]
+    cu_seqlens_q: torch.Tensor,  # [B+1], int32
+    cu_seqlens_k: torch.Tensor,  # [B+1], int32
+    w: torch.Tensor | int,  # [B], int32
+    sm_scale: float = None,  # defaults to 1/sqrt(D)
+    *,
+    accum_scores: torch.Tensor = None,
+    accum_blending: float = None,
+    normalize: bool = False,
+) -> Optional[torch.Tensor]:
+    """Score every key by the attention it receives from the last ``w`` queries.
+
+    For each batch element the last ``w[b]`` queries attend (without a causal
+    mask) to the keys that precede the window, ``[k_beg, k_end - w[b])``. The
+    softmax probabilities are summed over all window queries and over the
+    query heads that share a kv-head, then smoothed with a trailing average
+    of width 5. The keys inside the window itself are assigned ``+inf``.
+
+    Args:
+        q: Packed queries, ``[N_q, Hq, D]``, last dim contiguous.
+        k: Packed keys, ``[N_k, Hk, D]``, last dim contiguous.
+        cu_seqlens_q: Cumulative query offsets, ``[B + 1]``.
+        cu_seqlens_k: Cumulative key offsets, ``[B + 1]``.
+        w: Window length, either one int for all batches or a ``[B]`` tensor.
+        sm_scale: Logit scale; ``1 / sqrt(D)`` when ``None``.
+        accum_scores: Optional ``[N_k, Hk]`` tensor updated in place with the
+            new scores and returned instead of a fresh tensor.
+        accum_blending: Optional factor applied to ``accum_scores`` before the
+            new scores are added.
+        normalize: If true, z-score the scored region per batch element.
+
+    Returns:
+        ``accum_scores`` when it was given, otherwise a new ``[N_k, Hk]``
+        float32 tensor. Batch elements that the kernels skip (window of zero,
+        or no keys before the window) contribute zeros.
+    """
+    assert q.stride(-1) == 1 and k.stride(-1) == 1, "last dim must be contiguous"
+    device = q.device
+    _, Hq, D = q.shape
+    N_k, Hk, Dk = k.shape
+    # The kernels index q and k with the same head dimension D.
+    assert D == Dk, "q and k must share the head dimension"
+    assert (Hq % Hk) == 0, "Hq must be a multiple of Hk"
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(D)
+
+    B = cu_seqlens_q.numel() - 1
+    assert B == cu_seqlens_k.numel() - 1
+
+    G = Hq // Hk
+    if isinstance(w, int):
+        max_w = w
+        w = torch.full((B,), fill_value=w, device=device, dtype=torch.int32)
+    else:
+        # Validate the shape before reducing: max() of an empty tensor raises.
+        assert w.numel() == B
+        max_w = int(w.max().item()) if B > 0 else 0
+    ROWS_MAX = max_w * G
+
+    # Zero-initialised on purpose: both kernels return early, without writing,
+    # for batch elements with win <= 0 or with no keys before the window.
+    out = torch.zeros((N_k, Hk), dtype=torch.float32, device=device)
+
+    if ROWS_MAX > 0:
+        # Scratch buffers are fully written (for the rows that are later read)
+        # by the first kernel, so they can stay uninitialised.
+        m_scratch = torch.empty((B, Hk, ROWS_MAX), dtype=torch.float32, device=device)
+        S_scratch = torch.empty((B, Hk, ROWS_MAX), dtype=torch.float32, device=device)
+        logits_buf = torch.empty(
+            (N_k, Hk, ROWS_MAX), dtype=torch.float32, device=device
+        )
+
+        # strides
+        STRIDE_Q_NQ, STRIDE_Q_HQ, _ = q.stride()
+        STRIDE_K_NK, STRIDE_K_HK, _ = k.stride()
+        STRIDE_M_B, STRIDE_M_H, STRIDE_M_R = m_scratch.stride()
+        STRIDE_S_B, STRIDE_S_H, STRIDE_S_R = S_scratch.stride()
+        STRIDE_LG_NK, STRIDE_LG_HK, STRIDE_LG_R = logits_buf.stride()
+        STRIDE_OUT_NK, STRIDE_OUT_HK = out.stride()
+
+        def grid(META):
+            return B, Hk, triton.cdiv(ROWS_MAX, META["BLOCK_Q"])
+
+        # Pass 1: logits of window queries vs. keys, plus streaming LSE stats.
+        _lse_and_store_logits_kernel[grid](
+            q,
+            k,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            w,
+            m_scratch,
+            S_scratch,
+            logits_buf,
+            sm_scale,
+            QUERY_GROUP_SIZE=G,
+            D=D,
+            STRIDE_Q_NQ=STRIDE_Q_NQ,
+            STRIDE_Q_HQ=STRIDE_Q_HQ,
+            STRIDE_K_NK=STRIDE_K_NK,
+            STRIDE_K_HK=STRIDE_K_HK,
+            STRIDE_M_B=STRIDE_M_B,
+            STRIDE_M_H=STRIDE_M_H,
+            STRIDE_M_R=STRIDE_M_R,
+            STRIDE_S_B=STRIDE_S_B,
+            STRIDE_S_H=STRIDE_S_H,
+            STRIDE_S_R=STRIDE_S_R,
+            STRIDE_LG_NK=STRIDE_LG_NK,
+            STRIDE_LG_HK=STRIDE_LG_HK,
+            STRIDE_LG_R=STRIDE_LG_R,
+            ROWS_MAX=ROWS_MAX,
+        )
+
+        # Pass 2: normalise to probabilities, sum over rows, pool, fill window.
+        _scores_from_logits_kernel[(B, Hk)](
+            cu_seqlens_k,
+            w,
+            m_scratch,
+            S_scratch,
+            logits_buf,
+            out,
+            QUERY_GROUP_SIZE=G,
+            STRIDE_M_B=STRIDE_M_B,
+            STRIDE_M_H=STRIDE_M_H,
+            STRIDE_M_R=STRIDE_M_R,
+            STRIDE_S_B=STRIDE_S_B,
+            STRIDE_S_H=STRIDE_S_H,
+            STRIDE_S_R=STRIDE_S_R,
+            STRIDE_LG_NK=STRIDE_LG_NK,
+            STRIDE_LG_HK=STRIDE_LG_HK,
+            STRIDE_LG_R=STRIDE_LG_R,
+            STRIDE_OUT_NK=STRIDE_OUT_NK,
+            STRIDE_OUT_HK=STRIDE_OUT_HK,
+            DO_POOL=True,
+            KPOOL=5,
+        )
+        if normalize:
+            _zscore_per_batch_epilogue[(B,)](
+                out,
+                cu_seqlens_k,
+                w,
+                STRIDE_OUT_NK,
+                STRIDE_OUT_HK,
+                HK=Hk,
+                EPS=1e-12,
+            )
+
+    # With an all-zero window `out` is all zeros, so the accumulator is still
+    # blended and returned instead of being silently dropped.
+    if accum_scores is not None:
+        if accum_blending is not None:
+            accum_scores.mul_(accum_blending)
+        accum_scores.add_(out)
+        return accum_scores
+    return out
--- a/vllm/kvprune/config/__init__.py
+++ b/vllm/kvprune/config/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Engine / sampling / kernel constants (compactor-compatible)."""
+
+from vllm.kvprune.config.constants import RESERVED_BATCH, TRITON_RESERVED_BATCH
+
+__all__ = ["RESERVED_BATCH", "TRITON_RESERVED_BATCH"]
--- a/vllm/kvprune/config/constants.py
+++ b/vllm/kvprune/config/constants.py
+# Index 0 is the reserved batch value shared by host code and Triton kernels.
+# NOTE(review): presumably a batch slot that is never assigned to a real
+# request — confirm against the users of this constant.
+RESERVED_BATCH = 0
+# NOTE: Triton `tl.constexpr` is intended for use in kernel signatures/annotations.
+# Some Triton builds reject passing `tl.constexpr(...)` objects as constexpr values.
+# Keep the runtime value as a plain int and let kernel signatures declare constexpr.
+TRITON_RESERVED_BATCH = RESERVED_BATCH
--- a/vllm/kvprune/config/engine_config.py
+++ b/vllm/kvprune/config/engine_config.py
+import os
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import List, Optional
+
+from transformers import AutoConfig
+
+
+class AttentionBackend(Enum):
+    """Legacy coarse backend toggle (prefer :class:`KvpruneAttentionSchedule`)."""
+
+    # Values are spelled out; they match what ``auto()`` assigns (1, 2, ...).
+    FLASH_ATTENTION = 1
+    COMPACTOR_TRITON = 2
+
+
+class KvpruneAttentionSchedule(Enum):
+    """FlashAttention vs Triton split for prefill / decode (KV **writes** stay Triton)."""
+
+    # Values are spelled out; they match what ``auto()`` assigns (1, 2, 3).
+
+    # Default schedule: FlashAttention varlen prefill, Triton decode
+    # (``head_sparse_decode_attention``).
+    FA_PREFILL_TRITON_DECODE = 1
+    # Triton on both sides: prefill via ``causal_sparse_varlen_with_cache``.
+    TRITON_PREFILL_TRITON_DECODE = 2
+    # "PDFA": FlashAttention for prefill and decode; paged KV **storage**
+    # (incl. pruned top-k) is unchanged.
+    PDFA = 3
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for the :class:`LLM` engine.
+    Parameters
+    ----------
+    model : str
+        Hugging Face model identifier (e.g. ``"meta-llama/Meta-Llama-3-8B"``) or
+        a local model name that can be resolved by
+        :func:`transformers.AutoConfig.from_pretrained`.
+    path : str, optional
+        Local directory containing the model weights. If ``None``, the engine
+        will attempt to resolve a local snapshot for ``model`` using
+        :func:`huggingface_hub.snapshot_download`. A value that is not an
+        existing directory raises :class:`NotADirectoryError`.
+    nccl_port : int, optional, default 1218
+        Port number carried in the config; presumably the rendezvous port for
+        the tensor-parallel workers (not used inside this module).
+    max_num_seqs : int, default 256
+        Upper bound on the number of concurrent batches that the scheduler and
+        KV-cache manager are allowed to handle. This affects the size of the
+        page table and some internal buffers.
+    max_model_len : int, default 40960
+        Maximum context length (in tokens) that the engine will allocate KV cache
+        and CUDA graphs for. During initialization this value is clamped to
+        ``hf_config.max_position_embeddings`` for the chosen model.
+    gpu_memory_utilization : float, default 0.9
+        Fraction of the total GPU memory that may be used for KV cache and model
+        activations. Values should be in ``(0, 1]``. If this budget is too small,
+        the KV-cache manager may raise an error at warmup time due
+        to insufficient memory.
+    tensor_parallel_size : int, default 1
+        Number of tensor-parallel workers to shard the model
+        across. Must be between 1 and 8, and must evenly divide the model's
+        number of key/value heads. Values outside ``[1, 8]`` raise
+        :class:`ValueError`.
+    enforce_eager : bool, default False
+        If ``True``, disable CUDA graph capture and always run the model in
+        eager mode during decoding. This reduces throughput. When ``False``,
+        the engine will capture and reuse CUDA graphs for supported
+        batch sizes and sequence lengths.
+    hf_config : transformers.AutoConfig, optional
+        Pre-loaded Hugging Face configuration for the model. If ``None``,
+        it is loaded with ``AutoConfig.from_pretrained(model)``.
+    eos : int, default -1
+        Primary stop token id (warmup / single-id paths). If ``-1``, the
+        :class:`LLM` constructor fills this and :attr:`eos_token_ids` from the
+        tokenizer.
+    eos_token_ids : list of int, optional
+        All token ids that terminate generation (e.g. HF tokenizers may expose
+        ``eos_token_id`` as a list for chat models). If ``None``, inferred in
+        :class:`LLM` from the tokenizer and model type.
+    kvcache_page_size : int, default 128
+        Number of tokens stored in a single KV-cache page. Smaller pages improve
+        allocation flexibility but increase page-table overhead; larger pages
+        reduce overhead but have coarser granularity.
+    leverage_sketch_size : int, default 48
+        Sketch dimension used by the Compactor leverage-score estimator.
+    attention_schedule : KvpruneAttentionSchedule, default FA_PREFILL_TRITON_DECODE
+        Which **attention** implementation runs on prefill vs decode. KV **writes**
+        (``prefill_store_*``, ``decode_store_kv``, pruned top-k) always use the
+        existing Triton store kernels. Env ``VLLM_KVPRUNE_ATTENTION_SCHEDULE`` uses
+        short names: ``fa_triton`` (default), ``pdtriton``, ``pdfa``. Enum values:
+        ``FA_PREFILL_TRITON_DECODE`` — FA prefill, Triton decode;
+        ``TRITON_PREFILL_TRITON_DECODE`` — Triton prefill + decode;
+        ``PDFA`` — FA prefill + FA decode (still Triton KV I/O).
+    attention_backend : AttentionBackend, optional
+        Deprecated. When set (not ``None``) it **overrides**
+        ``attention_schedule``: ``FLASH_ATTENTION`` maps to
+        ``FA_PREFILL_TRITON_DECODE`` and any other value maps to
+        ``TRITON_PREFILL_TRITON_DECODE``. Leave it ``None`` to use
+        ``attention_schedule`` as given.
+    show_progress_bar : bool, default True
+        Flag carried in the config; presumably toggles progress bars in the
+        engine (not used inside this module).
+    """
+
+    model: str
+    path: Optional[str] = None
+    nccl_port: Optional[int] = 1218
+    max_num_seqs: int = 256
+    max_model_len: int = 40960
+    gpu_memory_utilization: float = 0.9
+    tensor_parallel_size: int = 1
+    enforce_eager: bool = False
+    hf_config: AutoConfig | None = None
+    eos: int = -1
+    eos_token_ids: Optional[List[int]] = None
+    kvcache_page_size: int = 128
+    leverage_sketch_size: int = 48
+    attention_schedule: KvpruneAttentionSchedule = (
+        KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
+    )
+    attention_backend: AttentionBackend | None = None
+    show_progress_bar: bool = True
+
+    def __post_init__(self):
+        """Apply the legacy backend mapping, validate fields, load ``hf_config``."""
+        # Backward compatibility: a legacy backend choice replaces the schedule.
+        if self.attention_backend is not None:
+            if self.attention_backend == AttentionBackend.FLASH_ATTENTION:
+                self.attention_schedule = KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
+            else:
+                self.attention_schedule = (
+                    KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
+                )
+        if self.path is not None and not os.path.isdir(self.path):
+            raise NotADirectoryError(f"Engine config dir {self.path} does not exist")
+        # Raise ValueError directly: an assert here would fire first (as
+        # AssertionError) and would vanish entirely under ``python -O``.
+        if self.tensor_parallel_size <= 0 or self.tensor_parallel_size > 8:
+            raise ValueError("tensor_parallel_size must be >= 1 and <= 8")
+        if self.hf_config is None:
+            self.hf_config = AutoConfig.from_pretrained(self.model)
+        # Never allocate for more context than the model supports.
+        self.max_model_len = min(
+            self.max_model_len, self.hf_config.max_position_embeddings
+        )
+
--- a/vllm/kvprune/config/sampling_params.py
+++ b/vllm/kvprune/config/sampling_params.py
+from dataclasses import dataclass
+
+
+@dataclass
+class SamplingParams:
+    """Per-request sampling settings.
+
+    ``temperature`` must be non-negative; a negative value raises
+    :class:`ValueError` at construction time.
+    """
+
+    temperature: float = 1.0
+    max_new_tokens: int = 256
+
+    def __post_init__(self):
+        is_negative = self.temperature < 0
+        if is_negative:
+            raise ValueError("Temperature cannot be negative")
--- a/vllm/kvprune/core/__init__.py
+++ b/vllm/kvprune/core/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Core: compactor ``LLMEngine`` stack (``llm_engine``, ``model_runner``, ``scheduler``, …).
+
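+The v1 integration path imports submodules explicitly (e.g.
+``from vllm.kvprune.core.llm_engine import LLMEngine``); this package is not
+required to re-export the optional hooks that have been removed
+(``runtime`` / ``flash_integration`` / ``block_budget``).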
+"""
+
+from vllm.kvprune.core.compression_bridge import (
+    VALID_ALIASES_FOR_SAMPLING,
+    compression_method_id_to_enum,
+    compression_method_str_to_id,
+)
+
+__all__ = [
+    "VALID_ALIASES_FOR_SAMPLING",
+    "compression_method_id_to_enum",
+    "compression_method_str_to_id",
+]
--- a/vllm/kvprune/core/compression_bridge.py
+++ b/vllm/kvprune/core/compression_bridge.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Map compression method strings (e.g. from :class:`~vllm.kvprune.integration.CompressionParams`) to kvprune GPU / enum IDs."""
+
+from __future__ import annotations
+
+from vllm.kvprune.compression.compression_config import CompressionMethod
+
+# IDs stored on device [num_reqs_padded] (int32). Order is stable for kernels.
+COMPRESSION_METHOD_ID_NONE = 0
+COMPRESSION_METHOD_ID_CRITICALADAKV = 1
+COMPRESSION_METHOD_ID_COMPACTOR = 2
+COMPRESSION_METHOD_ID_SNAPKV = 3
+
+# Aliases accepted for method strings (case-insensitive after strip).
+# Must list the same names as the keys of ``_STR_TO_ID`` below.
+VALID_ALIASES_FOR_SAMPLING: frozenset[str] = frozenset(
+    {"none", "criticaladakv", "compactor", "snapkv"}
+)
+
+# Normalised (stripped, lower-cased) method name -> integer id.
+_STR_TO_ID: dict[str, int] = {
+    "none": COMPRESSION_METHOD_ID_NONE,
+    "criticaladakv": COMPRESSION_METHOD_ID_CRITICALADAKV,
+    "compactor": COMPRESSION_METHOD_ID_COMPACTOR,
+    "snapkv": COMPRESSION_METHOD_ID_SNAPKV,
+}
+
+# Integer id -> ``CompressionMethod`` enum member.
+_ID_TO_COMPRESSION_METHOD: dict[int, CompressionMethod] = {
+    COMPRESSION_METHOD_ID_NONE: CompressionMethod.NONE,
+    COMPRESSION_METHOD_ID_CRITICALADAKV: CompressionMethod.CRITICALADAKV,
+    COMPRESSION_METHOD_ID_COMPACTOR: CompressionMethod.COMPACTOR,
+    COMPRESSION_METHOD_ID_SNAPKV: CompressionMethod.SNAPKV,
+}
+
+
+def compression_method_str_to_id(s: str) -> int:
+    """Return the stable integer id (0..3) for a compression-method name.
+
+    The name is stripped and lower-cased first; an empty or ``None`` value is
+    treated as ``"none"``. Unknown names raise :class:`ValueError`.
+    """
+    normalized = (s if s else "none").strip().lower()
+    method_id = _STR_TO_ID.get(normalized)
+    if method_id is not None:
+        return method_id
+    raise ValueError(
+        f"Unknown compression_method {s!r}; expected one of "
+        f"{sorted(VALID_ALIASES_FOR_SAMPLING)}"
+    )
+
+
+def compression_method_id_to_enum(method_id: int) -> CompressionMethod:
+    """Map an integer id to its ``CompressionMethod``; unknown ids give ``NONE``."""
+    return _ID_TO_COMPRESSION_METHOD.get(method_id, CompressionMethod.NONE)
+
+
+__all__ = [
+    "COMPRESSION_METHOD_ID_NONE",
+    "COMPRESSION_METHOD_ID_CRITICALADAKV",
+    "COMPRESSION_METHOD_ID_COMPACTOR",
+    "COMPRESSION_METHOD_ID_SNAPKV",
+    "VALID_ALIASES_FOR_SAMPLING",
+    "compression_method_id_to_enum",
+    "compression_method_str_to_id",
+]
--- a/vllm/kvprune/core/llm_engine.py
+++ b/vllm/kvprune/core/llm_engine.py
+from __future__ import annotations
+
+import atexit
+import inspect
+import logging
+from pathlib import Path
+from typing import Any, List, Optional, Union
+
+import torch.nn as nn
+import torch.multiprocessing as mp
+from vllm.kvprune.compression.compression_config import (
+    BatchCompressionParams,
+    SequenceCompressionParams,
+)
+from vllm.kvprune.config.engine_config import LLMConfig
+from vllm.kvprune.config.sampling_params import SamplingParams
+from vllm.kvprune.core.model_runner import ModelRunner
+from vllm.kvprune.models import MODEL_REGISTRY
+from vllm.kvprune.utils.sequence import Sequence
+from transformers import AutoTokenizer
+
+logger = logging.getLogger(__name__)
+
+PromptLike = Union[str, List[int]]
+
+
+def _infer_stop_token_ids(tokenizer, hf_config) -> list[int]:
+    """
+    Build the list of token ids that should end generation.
+
+    Newer HF chat tokenizers often expose ``eos_token_id`` as a *list* of ids.
+    The engine must not compare generated ids to that list as a single ``int``;
+    see :attr:`LLMConfig.eos_token_ids` and decode-time ``torch.isin``.
+
+    Qwen chat models end the assistant turn with the ``im_end`` special token
+    (``<|im_end|>``). For Qwen model types, every token whose text contains
+    ``im_end`` in ``added_tokens_encoder`` / ``additional_special_tokens`` is
+    added as well. Looser substring matches such as ``"end"`` are avoided on
+    purpose because they can tag unrelated tokens.
+
+    Args:
+        tokenizer: Object exposing ``eos_token_id`` and, optionally,
+            ``unk_token_id``, ``added_tokens_encoder``,
+            ``additional_special_tokens`` and ``convert_tokens_to_ids``.
+        hf_config: Object whose optional ``model_type`` selects the Qwen path.
+
+    Returns:
+        Stop token ids, tokenizer EOS ids first, extras in discovery order.
+
+    Raises:
+        ValueError: If no stop token id could be determined.
+    """
+    raw = tokenizer.eos_token_id
+    ids: list[int] = []
+    if isinstance(raw, (list, tuple)):
+        ids.extend(int(x) for x in raw)
+    elif raw is not None:
+        ids.append(int(raw))
+    unk_id = getattr(tokenizer, "unk_token_id", None)
+
+    def _maybe_add_tid(tid: int) -> None:
+        # Rejects failed lookups (None, lists, negatives) and the unknown
+        # token, which is what a tokenizer returns for an unrecognised string.
+        if not isinstance(tid, int) or tid < 0:
+            return
+        if unk_id is not None and tid == unk_id:
+            return
+        if tid not in ids:
+            ids.append(tid)
+
+    model_type = getattr(hf_config, "model_type", None)
+    if model_type in ("qwen2", "qwen3", "qwen2_moe", "qwen3_moe"):
+        enc = getattr(tokenizer, "added_tokens_encoder", None)
+        if isinstance(enc, dict):
+            for key, tid in enc.items():
+                if isinstance(key, str) and "im_end" in key:
+                    _maybe_add_tid(int(tid))
+        for extra in getattr(tokenizer, "additional_special_tokens", []) or []:
+            if not isinstance(extra, str) or "im_end" not in extra:
+                continue
+            try:
+                tid = tokenizer.convert_tokens_to_ids(extra)
+            except (TypeError, ValueError, KeyError):
+                continue
+            _maybe_add_tid(tid)
+
+    if not ids:
+        raise ValueError(
+            "Could not infer stop token ids from the tokenizer; set "
+            "LLMConfig(eos_token_ids=[...]) explicitly."
+        )
+    return ids
+
+
+def _merge_apply_chat_template_kwargs(
+    tokenizer,
+    user_kwargs: Optional[dict[str, Any]],
+) -> dict[str, Any]:
+    """
+    Return a copy of ``user_kwargs`` completed with chat-template defaults.
+
+    Two defaults are filled in, each only when ``tokenizer.apply_chat_template``
+    declares the parameter and the caller did not set it:
+
+    * ``add_generation_prompt=True`` — Qwen3 (and similar) instruct models need
+      it so the first generated token continues the assistant turn; without it
+      the output can repeat punctuation / template fragments.
+    * ``enable_thinking=False`` — keeps the Qwen3 reasoning channel off.
+
+    If the signature cannot be inspected, the user kwargs are returned as-is.
+    """
+    merged = {} if not user_kwargs else dict(user_kwargs)
+    try:
+        accepted = inspect.signature(tokenizer.apply_chat_template).parameters
+    except (TypeError, ValueError):
+        return merged
+    for name, default in (("add_generation_prompt", True), ("enable_thinking", False)):
+        if name in accepted and name not in merged:
+            merged[name] = default
+    return merged
+
+
+def _runner_entry(config: LLMConfig, rank: int, evt):
+    """Process entry point for a non-master tensor-parallel worker.
+
+    Builds a ``ModelRunner`` for ``rank`` and runs its loop. Any exception is
+    logged with its traceback rather than propagated, and the runner is
+    always shut down if it was constructed.
+    """
+    runner = None
+    try:
+        runner = ModelRunner(config, rank, evt)
+        runner.loop()
+    except Exception as e:
+        # Use the module logger (as the rest of this file does) so the record
+        # carries this module's name and follows its logging configuration.
+        logger.exception("Rank %s: %r", rank, e)
+    finally:
+        if runner is not None:
+            runner.exit()
+
+
+class LLMEngine:
+    """High-level engine coordinating model runners and scheduling"""
+
+    def __init__(self, config: LLMConfig, external_model: nn.Module | None = None):
+        """Resolve the model directory, tokenizer and stop ids, then start runners.
+
+        Args:
+            config: Engine configuration; ``path``, ``eos`` and
+                ``eos_token_ids`` are filled in / normalised in place.
+            external_model: Optional pre-built module handed to the master
+                ``ModelRunner``; only allowed with ``tensor_parallel_size=1``.
+
+        Raises:
+            ValueError: For an unregistered model type, or when
+                ``external_model`` is combined with tensor parallelism.
+        """
+        self.config = config
+        if self.config.hf_config.model_type not in MODEL_REGISTRY:
+            raise ValueError(f"Unknown model {self.config.model}")
+        if config.path is None:
+            # Local directory: use it directly (no Hub round-trip).
+            # The local is deliberately NOT called ``mp``: that name is the
+            # ``torch.multiprocessing`` module used further down, and
+            # assigning to it here would make it a function-local variable.
+            try:
+                model_dir = Path(config.model)
+                if model_dir.is_dir() and (model_dir / "config.json").is_file():
+                    self.config.path = str(model_dir.resolve())
+                    logger.info("Using local model directory for tokenizer: %s", self.config.path)
+            except OSError:
+                pass
+        if config.path is None:
+            from huggingface_hub import snapshot_download
+
+            # Hub repo id: allow downloading missing shards/tokenizer files when cache
+            # is incomplete (local_files_only=False). Local dirs are handled above.
+            self.config.path = snapshot_download(
+                repo_id=config.model,
+                local_files_only=False,
+            )
+            logger.info(
+                "Resolved Hugging Face snapshot for %s @ %s",
+                self.config.model,
+                self.config.path,
+            )
+        assert self.config.path is not None
+        _trust = bool(getattr(self.config.hf_config, "trust_remote_code", False))
+        # Always load tokenizer from the resolved on-disk tree so we do not re-hit
+        # the Hub with the repo id (can re-download tokenizer / LFS shards).
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.config.path,
+            use_fast=True,
+            trust_remote_code=_trust,
+        )
+        # Stop ids: explicit list wins, then an explicit single ``eos``, then
+        # whatever can be inferred from the tokenizer.
+        if self.config.eos_token_ids is None:
+            if self.config.eos != -1:
+                self.config.eos_token_ids = [int(self.config.eos)]
+            else:
+                self.config.eos_token_ids = _infer_stop_token_ids(
+                    self.tokenizer, self.config.hf_config
+                )
+        else:
+            self.config.eos_token_ids = [int(x) for x in self.config.eos_token_ids]
+        self.config.eos_token_ids = sorted(set(self.config.eos_token_ids))
+        # Keep ``eos`` and ``eos_token_ids`` consistent with each other.
+        if self.config.eos == -1:
+            self.config.eos = int(self.config.eos_token_ids[0])
+        else:
+            self.config.eos = int(self.config.eos)
+            if self.config.eos not in self.config.eos_token_ids:
+                self.config.eos_token_ids = sorted(
+                    self.config.eos_token_ids + [self.config.eos]
+                )
+
+        if external_model is not None and int(self.config.tensor_parallel_size) != 1:
+            raise ValueError(
+                "external_model (shared-weight compactor path) only supports "
+                "tensor_parallel_size=1"
+            )
+
+        self.ps = []
+        world_size = int(self.config.tensor_parallel_size)
+        self.events = []
+        if world_size > 1:
+            # Ranks 1..world_size-1 run in spawned daemon processes; rank 0
+            # is the master runner created in this process below.
+            ctx = mp.get_context("spawn")
+            for r in range(1, world_size):
+                event = ctx.Event()
+                p = ctx.Process(
+                    target=_runner_entry,
+                    args=(self.config, r, event),
+                    daemon=True,
+                )
+                p.start()
+                self.ps.append(p)
+                self.events.append(event)
+
+        self.master_model_runner = ModelRunner(
+            self.config,
+            rank=0,
+            peer_events=self.events,
+            external_model=external_model,
+        )
+        atexit.register(self.exit)
+
+    def exit(self):
+        """Shut the engine down once; later calls are no-ops.
+
+        Exits the master runner (failures are logged, not raised), terminates
+        worker processes that are still alive, and clears the peer events.
+        """
+        already_exited = getattr(self, "_exited", False)
+        if already_exited:
+            return
+        self._exited = True
+
+        master = getattr(self, "master_model_runner", None)
+        if master is not None:
+            try:
+                master.exit()
+            except Exception:
+                logger.exception("Failed to exit master ModelRunner cleanly")
+
+        for proc in self.ps:
+            if not proc.is_alive():
+                continue
+            proc.terminate()
+            proc.join(timeout=1.0)
+
+        if hasattr(self, "events"):
+            self.events.clear()
+
+    def tokenize_prompt(self, prompt: PromptLike, **tokenizer_kwargs) -> List[int]:
+        """
+        Return token ids for ``prompt``.
+
+        A string is run through the tokenizer (``tokenizer_kwargs`` are
+        forwarded); anything else is taken as already tokenized and copied
+        into a new list.
+        """
+        if not isinstance(prompt, str):
+            return list(prompt)
+        encoded = self.tokenizer(prompt, **tokenizer_kwargs)
+        return encoded["input_ids"]
+
+    def detokenize_prompt(
+        self, sequences: List[Sequence], **detokenizer_kwargs
+    ) -> List[str]:
+        """
+        Decode the completion tokens of each sequence into text.
+
+        ``skip_special_tokens=True`` is the default; any keyword passed in
+        ``detokenizer_kwargs`` overrides it and is forwarded to
+        ``tokenizer.batch_decode``.
+        """
+        decode_kwargs: dict[str, Any] = {"skip_special_tokens": True}
+        decode_kwargs.update(detokenizer_kwargs)
+        completions = [seq.completion_token_ids for seq in sequences]
+        return self.tokenizer.batch_decode(completions, **decode_kwargs)
+
+    def _build_sequences(
+        self,
+        prompts: List[PromptLike] | PromptLike,
+        sampling_params: SamplingParams | List[SamplingParams],
+        per_sequence_compression_params: Optional[
+            SequenceCompressionParams | List[SequenceCompressionParams]
+        ] = None,
+        tokenizer_kwargs: Optional[dict[str, Any]] = None,
+    ) -> List[Sequence]:
+        """
+        Build one ``Sequence`` per prompt.
+
+        Args:
+            prompts: A single prompt or a list; each is text or token ids.
+            sampling_params: One ``SamplingParams`` for all prompts, or a list
+                with one entry per prompt.
+            per_sequence_compression_params: ``None`` (every sequence gets
+                ``SequenceCompressionParams(1.0)``), one instance applied to
+                all prompts, or a list with one entry per prompt.
+            tokenizer_kwargs: Extra kwargs forwarded to the tokenizer for
+                string prompts.
+
+        Returns:
+            The sequences, in prompt order. When a prompt is no longer than
+            its protected first + last token counts, that sequence gets a
+            private copy of its params with ``compression_ratio = 1.0``; the
+            caller's params objects are never modified.
+        """
+        import copy
+
+        tokenizer_kwargs = {} if tokenizer_kwargs is None else tokenizer_kwargs
+
+        if not isinstance(prompts, list):
+            prompts = [prompts]
+
+        if isinstance(sampling_params, SamplingParams):
+            sampling_params_list: List[SamplingParams] = [sampling_params] * len(
+                prompts
+            )
+        else:
+            sampling_params_list = sampling_params
+            assert len(sampling_params_list) == len(prompts), (
+                "sampling_params list must match prompts length"
+            )
+        if per_sequence_compression_params is None:
+            compression_params_list: List[SequenceCompressionParams] = [
+                SequenceCompressionParams(1.0) for _ in prompts
+            ]
+        elif isinstance(per_sequence_compression_params, SequenceCompressionParams):
+            # The same object is referenced once per prompt here.
+            compression_params_list = [per_sequence_compression_params] * len(prompts)
+        else:
+            # list-like
+            assert len(per_sequence_compression_params) == len(prompts), (
+                "per_sequence_compression_params list must match prompts length"
+            )
+            compression_params_list = list(per_sequence_compression_params)
+
+        seqs: List[Sequence] = []
+        for prompt, sparams, cparams in zip(
+            prompts, sampling_params_list, compression_params_list
+        ):
+            token_ids = self.tokenize_prompt(prompt, **tokenizer_kwargs)
+            if cparams.protected_first_tokens + cparams.protected_last_tokens >= len(token_ids):
+                # Nothing left to prune: disable compression for this sequence
+                # only. Copy first, because `cparams` may be shared by every
+                # sequence in the batch and belongs to the caller.
+                cparams = copy.copy(cparams)
+                cparams.compression_ratio = 1.0
+            seqs.append(
+                Sequence(
+                    prompt_token_ids=token_ids,
+                    sampling_params=sparams,
+                    compression_params=cparams,
+                )
+            )
+        return seqs
+
+    def generate(
+        self,
+        prompts: List[PromptLike] | PromptLike,
+        sampling_params: SamplingParams | List[SamplingParams],
+        batch_compression_params: BatchCompressionParams,
+        *,
+        per_sequence_compression_params: Union[
+            List[SequenceCompressionParams], SequenceCompressionParams
+        ] = None,
+        tokenizer_kwargs: Optional[dict[str, Any]] = None,
+        detokenizer_kwargs: Optional[dict[str, Any]] = None,
+        return_sequences: bool = False,
+    ) -> List[str] | tuple[List[str], List[Sequence]]:
+        """
+        Accept prompts and return completed Sequences.
+        Args:
+            :param prompts:
+                Single prompt or list of prompts, each either a raw text prompt,
+                or pre-tokenized input IDs.
+            :param sampling_params:
+                A single SamplingParams for all prompts in this batch or a list of
+                SamplingParams with the same length as ``prompts``.
+            :param batch_compression_params:
+                Compression settings for this batch.
+            :param per_sequence_compression_params:
+                Per-sequence compression parameters, including the compression
+                ratio to be applied and the size of the protected regions of the
+                sequence (how many start tokens and end tokens to keep uncompressed).
+                If a SequenceCompressionParams instance, the same params will be
+                applied to all sequences in this batch; if a list is provided,
+                each SequenceCompressionParams will be attached to the corresponding
+                prompt in the batch.
+            :param tokenizer_kwargs:
+                Extra kwargs forwarded to ``tokenizer(...)`` when tokenizing
+                string prompts.
+            :param detokenizer_kwargs:
+                Passed through to `tokenizer.batch_decode`.
+            :param return_sequences:
+                Whether to return sequence objects or not
+        Returns:
+            :return List[Sequence]:
+                One Sequence per input prompt, with `completion_token_ids`
+                filled in after generation.
+        """
+        tokenizer_kwargs = {} if tokenizer_kwargs is None else tokenizer_kwargs
+        detokenizer_kwargs = {} if detokenizer_kwargs is None else detokenizer_kwargs
+        seqs = self._build_sequences(
+            prompts,
+            sampling_params=sampling_params,
+            per_sequence_compression_params=per_sequence_compression_params,
+            tokenizer_kwargs=tokenizer_kwargs,
+        )
+        self.master_model_runner.generate(seqs, batch_compression_params)
+        output_strings = self.detokenize_prompt(seqs, **detokenizer_kwargs)
+        if return_sequences:
+            return output_strings, seqs
+        return output_strings
+
+    def generate_chat(
+        self,
+        messages_batch: List[List[dict]],
+        sampling_params: SamplingParams | List[SamplingParams],
+        batch_compression_params: BatchCompressionParams,
+        per_sequence_compression_params: Union[
+            SequenceCompressionParams, List[SequenceCompressionParams]
+        ],
+        *,
+        tokenizer_kwargs: Optional[dict[str, Any]] = None,
+        detokenizer_kwargs: Optional[dict[str, Any]] = None,
+        return_sequences: bool = False,
+    ) -> List[str] | tuple[List[str], List[Sequence]]:
+        """
+        Convenience API for chat-style prompts using HF `apply_chat_template`.
+        Args:
+            :param messages_batch:
+                List of conversations, where each conversation is a list of
+                message dicts like:
+                    {"role": "system" | "user" | "assistant", "content": str}
+            :param sampling_params:
+                A single SamplingParams for all prompts in this batch or a list of
+                SamplingParams with the same length as ``prompts``.
+            :param batch_compression_params:
+                Batch Level compression settings. Can set compression_method.
+            :param per_sequence_compression_params:
+                Per-sequence compression parameters, including the compression
+                ratio to be applied and the size of the protected regions of the
+                sequence (how many start tokens and end tokens to keep uncompressed).
+                If a SequenceCompressionParams instance, the same params will be
+                applied to all sequences in this batch; if a list is provided,
+                each SequenceCompressionParams will be attached to the corresponding
+                conversation in the batch.
+            :param tokenizer_kwargs:
+                Passed through to `tokenizer.apply_chat_template`.
+            :param detokenizer_kwargs:
+                Passed through to `tokenizer.batch_decode`.
+            :param return_sequences:
+                Whether to return sequence objects or not
+        Returns:
+            :return List[str] or tuple[List[str], List[Sequence]]:
+                One string per conversation.
+        """
+        prompts_token_ids: List[List[int]] = []
+        tokenizer_kwargs = _merge_apply_chat_template_kwargs(
+            self.tokenizer, tokenizer_kwargs
+        )
+        detokenizer_kwargs = {} if detokenizer_kwargs is None else detokenizer_kwargs
+        for messages in messages_batch:
+            input_ids = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=True,
+                **tokenizer_kwargs,
+            )
+            if hasattr(input_ids, "tolist"):
+                input_ids = input_ids.tolist()
+            prompts_token_ids.append(input_ids)
+
+        return self.generate(
+            prompts_token_ids,
+            sampling_params=sampling_params,
+            batch_compression_params=batch_compression_params,
+            per_sequence_compression_params=per_sequence_compression_params,
+            tokenizer_kwargs=tokenizer_kwargs,
+            detokenizer_kwargs=detokenizer_kwargs,
+            return_sequences=return_sequences,
+        )
+
+    def generate_from_sequences(
+        self,
+        seqs: List[Sequence],
+        batch_compression_params: BatchCompressionParams,
+    ) -> List[Sequence]:
+        """
+        Args:
+            :param seqs:
+                List of Sequence instances
+            :param batch_compression_params:
+                Compression settings.
+
+        Returns:
+            :return List[Sequence]:
+                Same list, mutated in-place with completions.
+        """
+        self.master_model_runner.generate(seqs, batch_compression_params)
+        return seqs
+
--- a/vllm/kvprune/core/memory_manager.py
+++ b/vllm/kvprune/core/memory_manager.py
+import logging
+import os
+from typing import Iterable, List, Optional
+
+import torch
+from vllm.kvprune.config.engine_config import LLMConfig
+from vllm.kvprune.kv_cache.page_table import KVAllocationStatus, PagedKVCache
+from vllm.kvprune.utils.tp_utils import kv_heads_shard_divisor
+from torch import nn
+
+logger = logging.getLogger(__name__)
+
+
+class KVCacheManager:
+    def __init__(
+        self,
+        rank: int,
+        config: LLMConfig,
+        *,
+        device: str | None = None,
+    ):
+        super().__init__()
+        hf_config = config.hf_config
+        self.rank = rank
+        self.gpu_frac = config.gpu_memory_utilization
+        self.page_size = config.kvcache_page_size
+        self.world_size = config.tensor_parallel_size
+        self.max_num_batches = config.max_num_seqs
+        self.max_model_len = config.max_model_len
+        self.num_layers = hf_config.num_hidden_layers
+        self.model_dtype = hf_config.torch_dtype
+        self.head_dim = getattr(hf_config, "head_dim", None)
+        self.max_pages_per_batch = (
+            self.max_model_len + self.page_size - 1
+        ) // self.page_size
+        _ws = kv_heads_shard_divisor()
+        self.num_kv_heads = hf_config.num_key_value_heads // _ws
+        assert hf_config.num_key_value_heads % _ws == 0, (
+            "tensor-parallel world size needs to divide num_kv_heads"
+        )
+        self._cache_device = device if device is not None else f"cuda:{self.rank}"
+
+        self.num_pages = None
+        self.paged_cache: Optional[PagedKVCache] = None
+        self.max_batched_tokens = None
+
+        self.seq_id_to_batch = {}
+
+    def allocate_sequences(
+        self, seq_ids: List[int], max_positions: List[int]
+    ) -> (bool, Optional[torch.Tensor]):
+        batch_mapping = []
+        for seq_id, len_to_alloc in zip(seq_ids, max_positions):
+            if seq_id not in self.seq_id_to_batch:
+                batch_id = self.paged_cache.new_batch()
+                if batch_id is None:
+                    logger.warning("Failed to allocate batch!")
+                    return False, None
+                self.seq_id_to_batch[seq_id] = int(batch_id)
+            batch_mapping.append(self.seq_id_to_batch[seq_id])
+            if (
+                alloc_status := self.paged_cache.reserve_tokens(
+                    self.seq_id_to_batch[seq_id], len_to_alloc
+                )
+            ) != KVAllocationStatus.SUCCESS:
+                logger.warning(f"Failed to allocate pages ({alloc_status})!")
+                return False, None
+        batch_mapping = torch.as_tensor(batch_mapping, dtype=torch.int32, device="cuda")
+        return True, batch_mapping
+
+    def free_sequences(self, seq_ids: Iterable[int]):
+        for seq_id in seq_ids:
+            global_batch_id = self.seq_id_to_batch.pop(seq_id, None)
+            self.paged_cache.free_batch(global_batch_id)
+
+    def init_cache(self, model: nn.Module):
+        self.num_pages = self.get_num_pages(self.gpu_frac, self.max_pages_per_batch)
+        self.paged_cache = PagedKVCache(
+            num_layers=self.num_layers,
+            H_kv=self.num_kv_heads,
+            head_dim=self.head_dim,
+            page_size=self.page_size,
+            num_pages=int(self.num_pages),
+            max_num_batches=self.max_num_batches,
+            device=self._cache_device,
+            dtype=self.model_dtype,
+            max_logical_pages_per_head=int(self.max_pages_per_batch),
+        )
+        self._assign_cache_to_layers(model)
+
+    def _assign_cache_to_layers(self, model) -> None:
+        for layer_index, layer in enumerate(model.model.layers):
+            attn = layer.self_attn.attn
+            k, v, pt, bh = self.paged_cache.layer_slices(layer_index)
+            attn.k_cache = k
+            attn.v_cache = v
+            attn.page_table = pt
+            attn.bh_seq_lens = bh
+            attn.page_size = self.page_size
+
+    def get_num_pages(self, frac: float, n_logical_pages_max: int):
+        free, total = torch.cuda.mem_get_info()
+        used = total - free
+        stats = torch.cuda.memory_stats()
+        peak = int(stats["allocated_bytes.all.peak"])
+        current = int(stats["allocated_bytes.all.current"])
+        bytes_for_kv_budget = int(total * frac * 0.9) - used - peak + current
+
+        if bytes_for_kv_budget <= 0:
+            # Standalone compactor: ``frac`` is a fraction of total VRAM. When a second
+            # engine shares the GPU with vLLM (shared weights), most VRAM is already
+            # committed; the formula above goes negative. Fall back to a slice of
+            # *currently free* memory for the compactor KV pool.
+            free_frac = float(
+                os.environ.get("VLLM_KVPRUNE_COMPACTOR_KV_FREE_FRAC", "0.55")
+            )
+            free_frac = max(0.05, min(free_frac, 0.95))
+            bytes_for_kv_budget = int(free * free_frac)
+            logger.warning(
+                "KV cache budget from gpu_memory_utilization (%.2f) is exhausted "
+                "(%.2f MiB free on device); using %.0f%% of free memory (~%.2f MiB) "
+                "for compactor KV (set VLLM_KVPRUNE_COMPACTOR_KV_FREE_FRAC to adjust).",
+                frac,
+                free / (1024**2),
+                free_frac * 100,
+                bytes_for_kv_budget / (1024**2),
+            )
+        if bytes_for_kv_budget <= 0:
+            raise RuntimeError(
+                "Insufficient memory for compactor KV cache: no free GPU memory left "
+                "after the primary vLLM engine. Lower vLLM gpu_memory_utilization or "
+                "max_model_len, shorten prompts, or run compactor-only / vLLM-only "
+                "sessions. Raising gpu_memory_utilization here does not help."
+            )
+        # page_table[L, B, H_kv, N_LOGICAL_PAGES_MAX] + bh_seq_lens[L, B, H_kv]
+        int32_sz = torch.empty((), dtype=torch.int32).element_size()  # 4
+        page_table_bytes_per_layer = (
+            self.max_num_batches
+            * self.num_kv_heads
+            * n_logical_pages_max
+            * int32_sz  # page_table
+            + self.max_num_batches * self.num_kv_heads * int32_sz
+        )
+        total_page_table_bytes = self.num_layers * page_table_bytes_per_layer
+        kv_bytes_net = bytes_for_kv_budget - total_page_table_bytes
+        if kv_bytes_net <= 0:
+            # Tight VRAM: metadata alone can exceed the first budget; reserve page
+            # tables plus a slice of remaining free for KV tensors.
+            bytes_for_kv_budget = min(
+                int(free * 0.95),
+                total_page_table_bytes + max(int(free * 0.25), 8 * 1024 * 1024),
+            )
+            kv_bytes_net = bytes_for_kv_budget - total_page_table_bytes
+        if kv_bytes_net <= 0:
+            raise RuntimeError(
+                "page-table footprint exceeds available GPU memory for compactor KV. "
+                f"Reduce vLLM max_num_seqs (compactor uses {self.max_num_batches}) "
+                f"or max_model_len ({self.max_model_len}), or free GPU memory."
+            )
+        dtype_sz = torch.empty((), dtype=self.model_dtype).element_size()
+        bytes_per_page_across_layers = self.num_layers * (
+            2 * self.page_size * self.head_dim * dtype_sz
+        )
+        return max(1, kv_bytes_net // bytes_per_page_across_layers)
+
+    def estimate_max_batched_tokens(
+        self,
+        warmup_tokens: int,
+        bytes_used_before_warmup: int,
+        bytes_peak_after_warmup: int,
+    ) -> int:
+        """
+        Estimate the max total number of tokens that can be processed concurrently
+        without OOM.
+        """
+        assert warmup_tokens > 0, "warmup_tokens must be > 0"
+        # activation bytes per token
+        warmup_delta = max(
+            0, int(bytes_peak_after_warmup) - int(bytes_used_before_warmup)
+        )
+        bytes_per_token = max(1, (warmup_delta + warmup_tokens - 1) // warmup_tokens)
+
+        free, total = torch.cuda.mem_get_info()
+        target = int(total * self.gpu_frac)
+        used_now = int(total - free)
+        # reserve headroom equal to the gap between peak and current allocations seen so far
+        stats = torch.cuda.memory_stats()
+        peak_cur = int(stats.get("allocated_bytes.all.peak", 0))
+        cur_now = int(stats.get("allocated_bytes.all.current", 0))
+        cushion = max(0, peak_cur - cur_now)
+
+        activation_budget = int(max(0, target - used_now - cushion) * 0.95)
+        max_tokens_per_batch = activation_budget // bytes_per_token
+        max_tokens_in_cache = (self.num_pages * self.page_size) // self.num_kv_heads
+        # round to lower multiple of page size
+        max_tokens_per_batch = (max_tokens_per_batch // self.page_size) * self.page_size
+        max_tokens_in_cache = (max_tokens_in_cache // self.page_size) * self.page_size
+
+        # When vLLM shares the same GPU, ``used_now`` often exceeds ``target`` (same
+        # situation as ``get_num_pages``), so activation_budget is ~0 and
+        # ``max_tokens_per_batch`` rounds to 0 or one page. The min(...) would then
+        # cap prefill at ~page_size tokens (e.g. 32) even though the compactor KV pool
+        # is large — no prompt longer than that can be scheduled. Prefer KV capacity
+        # (capped by max_model_len) whenever activation math yields only a token or two.
+        if (
+            max_tokens_in_cache > 0
+            and max_tokens_per_batch <= self.page_size
+            and max_tokens_in_cache > max_tokens_per_batch
+        ):
+            max_tokens_per_batch = min(max_tokens_in_cache, self.max_model_len)
+
+        self.max_batched_tokens = min(max_tokens_in_cache, max_tokens_per_batch)
+        # Last resort: allow at least one page when KV exists but min(...) is still 0.
+        if self.max_batched_tokens == 0 and self.num_pages > 0 and max_tokens_in_cache > 0:
+            self.max_batched_tokens = min(max_tokens_in_cache, self.page_size)
+        return self.max_batched_tokens
+
+    @property
+    def num_free_batches(self) -> int:
+        return len(self.paged_cache.free_batches)
+
+    @property
+    def num_free_pages(self) -> int:
+        return min(len(fp) for fp in self.paged_cache.free_pages)
+
+    def reclaim_pages(
+        self,
+        seq_ids_to_reclaim: Iterable[int],
+        future_reserved_buffer: List[int] | torch.Tensor,
+    ) -> int:
+        approximate_bytes_freed = 0
+        for i, seq_id in enumerate(seq_ids_to_reclaim):
+            batch_idx = self.seq_id_to_batch[seq_id]
+            approximate_bytes_freed += self.paged_cache.reclaim_pages(
+                batch_idx, future_reserved_buffer[i]
+            )
+        return approximate_bytes_freed
--- a/vllm/kvprune/core/model_runner.py
+++ b/vllm/kvprune/core/model_runner.py
+import atexit
+import logging
+import os
+import inspect
+from typing import Any, List, Optional
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+from vllm.kvprune.attention.sparse_decode_kernel import num_splits_heuristic
+from vllm.kvprune.compression.compression_config import BatchCompressionParams
+from vllm.kvprune.config.constants import RESERVED_BATCH
+from vllm.kvprune.config.engine_config import LLMConfig, KvpruneAttentionSchedule
+from vllm.kvprune.core.memory_manager import KVCacheManager
+from vllm.kvprune.core.scheduler import Scheduler
+from vllm.kvprune.layers.sampler import Sampler
+from vllm.kvprune.models import MODEL_REGISTRY
+from vllm.kvprune.utils.arguments import (
+    DecodeBatchArguments,
+    DecodeBatchOutput,
+    PackedTensorArguments,
+    PrefillBatchArguments,
+)
+from vllm.kvprune.utils.context import CompressionContext, reset_context, set_context
+from vllm.kvprune.utils.kv_dist import barrier_sync, broadcast_from_tp_rank0
+from vllm.kvprune.utils.sequence import Sequence
+from torch.multiprocessing import Event
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+class ModelRunner:
+    """Per-rank execution loop. Manages model, sampler, KV cache, and warmup"""
+
+    def __init__(
+        self,
+        config: LLMConfig,
+        rank: int,
+        batch_ready: Optional[Event] = None,
+        peer_events: List[Event] = None,
+        external_model: Optional[nn.Module] = None,
+        *,
+        embedded_in_vllm_worker: bool = False,
+        device: Optional[torch.device] = None,
+    ):
+        """Set up the process group, model, sampler, KV cache and CUDA graphs.
+
+        The statement order matters: the model is warmed up once without KV
+        to record memory statistics, the KV cache is then sized and created,
+        the batched-token budget is estimated from those statistics, and a
+        second warmup runs with the KV cache in place.
+
+        :param config: Engine configuration; ``eos_token_ids`` must be set.
+        :param rank: Rank used in standalone mode. In embedded mode the
+            tensor-parallel rank reported by vLLM is used instead.
+        :param batch_ready: Event polled by :meth:`loop`.
+        :param peer_events: Events of peer ranks; ``None`` becomes ``[]``.
+        :param external_model: Pre-built model to use instead of creating and
+            loading one from ``MODEL_REGISTRY``.
+        :param embedded_in_vllm_worker: Run inside an existing vLLM worker
+            and reuse its already-initialized ``torch.distributed`` state.
+        :param device: Explicit CUDA device; defaults to the current device
+            (embedded) or ``cuda:{rank}`` (standalone).
+        :raises RuntimeError: On a tensor-parallel / world-size mismatch, or
+            when embedded mode is requested without an initialized process
+            group.
+        :raises NotImplementedError: In embedded mode when the global world
+            size differs from the tensor-parallel size.
+        """
+        self.config = config
+        self.embedded_in_vllm_worker = embedded_in_vllm_worker
+        if embedded_in_vllm_worker:
+            from vllm.distributed.parallel_state import (
+                get_tensor_model_parallel_rank,
+                get_tensor_model_parallel_world_size,
+            )
+
+            tp_ws = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()
+            if tp_ws != config.tensor_parallel_size:
+                raise RuntimeError(
+                    f"tensor parallel world size {tp_ws} != "
+                    f"LLMConfig.tensor_parallel_size {config.tensor_parallel_size}"
+                )
+            # The ``rank`` argument is ignored here; the TP rank wins.
+            self.rank = tp_rank
+            _dev = device if device is not None else torch.device(
+                f"cuda:{torch.cuda.current_device()}"
+            )
+            if not dist.is_initialized():
+                raise RuntimeError(
+                    "embedded_in_vllm_worker requires torch.distributed to be "
+                    "initialized (vLLM worker)."
+                )
+            if dist.get_world_size() != tp_ws:
+                raise NotImplementedError(
+                    "KV-prune compactor embedded in vLLM currently requires "
+                    "dist.get_world_size() == tensor_parallel_size "
+                    "(pipeline_parallel_size=1, data_parallel_size=1). "
+                    f"Got dist.get_world_size()={dist.get_world_size()}, "
+                    f"tp_ws={tp_ws}."
+                )
+        else:
+            self.rank = rank
+            _dev = device if device is not None else torch.device(f"cuda:{rank}")
+
+        self._device = _dev
+        assert config.eos_token_ids is not None and len(config.eos_token_ids) > 0, (
+            "LLMConfig.eos_token_ids must be set (filled in LLMEngine from tokenizer)."
+        )
+        self._stop_token_ids = torch.tensor(
+            config.eos_token_ids, dtype=torch.int64, device=_dev
+        )
+        hf_config = config.hf_config
+        self.enforce_eager = config.enforce_eager
+        # The PDFA schedule always runs eagerly, whatever the config says.
+        if config.attention_schedule == KvpruneAttentionSchedule.PDFA:
+            if not self.enforce_eager and self.rank == 0:
+                logger.info(
+                    "attention_schedule=PDFA: disabling compactor decode CUDA graphs "
+                    "(FlashAttention decode path)."
+                )
+            self.enforce_eager = True
+        # Embedded in vLLM worker (TP>1): respect :attr:`LLMConfig.enforce_eager` from
+        # ``v1_tp_runner._apply_compactor_env_overrides``. Set
+        # ``VLLM_KVPRUNE_TP_EMBEDDED_GRAPH=0`` to force eager if graph replay is unstable
+        # with shared vLLM VRAM / streams / NCCL on your stack.
+        if embedded_in_vllm_worker:
+            _tp_graph = os.environ.get(
+                "VLLM_KVPRUNE_TP_EMBEDDED_GRAPH", "1"
+            ).strip().lower()
+            if _tp_graph in ("0", "false", "no"):
+                if not self.enforce_eager:
+                    logger.info(
+                        "embedded_in_vllm_worker: VLLM_KVPRUNE_TP_EMBEDDED_GRAPH=0 → "
+                        "forcing compactor enforce_eager=True (skip compactor CUDA graph "
+                        "capture)."
+                    )
+                self.enforce_eager = True
+        self.world_size = config.tensor_parallel_size
+        self.leverage_sketch_size = config.leverage_sketch_size
+        self.show_progress_bar = config.show_progress_bar
+        self.max_num_batches = config.max_num_seqs
+        self.max_model_len = config.max_model_len
+        self.num_layers = hf_config.num_hidden_layers
+        self.model_dtype = hf_config.torch_dtype
+        self.head_dim = getattr(hf_config, "head_dim", None)
+
+        init_kwargs = {}
+        if not embedded_in_vllm_worker:
+            # ``device_id`` is only passed when this torch version's
+            # ``init_process_group`` accepts it.
+            if "device_id" in inspect.signature(dist.init_process_group).parameters:
+                init_kwargs["device_id"] = torch.device(f"cuda:{rank}")
+            if not dist.is_initialized():
+                dist.init_process_group(
+                    "nccl",
+                    f"tcp://localhost:{config.nccl_port}",
+                    world_size=self.world_size,
+                    rank=rank,
+                    **init_kwargs,
+                )
+            else:
+                # A process group already exists: reuse it, but only if its
+                # world size matches the configured tensor-parallel size.
+                ws = dist.get_world_size()
+                if ws != self.world_size:
+                    raise RuntimeError(
+                        "torch.distributed is already initialized with "
+                        f"world_size={ws}, but compactor ModelRunner expects "
+                        f"tensor_parallel_size={self.world_size}. "
+                        "Use tensor_parallel_size matching the active process group "
+                        "(typically 1 when sharing weights with vLLM)."
+                    )
+        torch.cuda.set_device(_dev)
+        # Process-wide defaults are switched to CUDA / the model dtype from
+        # here until they are set back further down. NOTE(review): the
+        # default device is set back to "cpu", not to whatever it was before.
+        default_dtype = torch.get_default_dtype()
+        torch.set_default_dtype(hf_config.torch_dtype)
+        torch.set_default_device("cuda")
+        model_type = hf_config.model_type
+        if external_model is not None:
+            self.model = external_model
+        else:
+            self.model = MODEL_REGISTRY[model_type](hf_config)
+            self.model.load_model(
+                config.path, use_tqdm=self.is_master and self.show_progress_bar
+            )
+        self.sampler = Sampler()
+
+        # Allocation stats taken around the KV-less warmup feed
+        # ``estimate_max_batched_tokens`` below.
+        pre_warmup_mem = torch.cuda.memory_stats().get("allocated_bytes.all.current", 0)
+        # No paged KV yet: FA-only varlen path (see :meth:`warmup`).
+        self.warmup(num_warmup_tokens=self.max_model_len, with_kv=False)
+        post_warmup_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
+
+        self.kv_manager = KVCacheManager(
+            self.rank, config, device=str(self._device)
+        )
+        self.kv_manager.init_cache(self.model)
+
+        self.store_stream: Optional[torch.cuda.Stream] = torch.cuda.Stream()
+        torch.set_default_device("cpu")
+        torch.set_default_dtype(default_dtype)
+
+        self.batch_ready = batch_ready
+        self.peer_events = peer_events if peer_events is not None else []
+        # Embedded TP peers: session end is signaled via TP-group broadcast in
+        # maybe_release_peers (no multiprocessing.Event — not pickleable over RPC).
+        self._embedded_peer_continue = True
+        self.captured_graphs = {}
+        self.min_captured_len = {}
+        self.max_batched_tokens = self.kv_manager.estimate_max_batched_tokens(
+            self.max_model_len, pre_warmup_mem, post_warmup_peak
+        )
+        if self.is_master:
+            logger.info(f"Estimated max batched tokens of {self.max_batched_tokens}")
+        self.warmup(num_warmup_tokens=self.max_model_len, with_kv=True)
+
+        if not self.enforce_eager:
+            # Powers of two 1, 2, 4, ... not exceeding ``max_num_batches``.
+            # Note that ``bs`` is first this list and is then rebound to each
+            # element by the ``for`` loop below.
+            bs = [1 << i for i in range(self.max_num_batches.bit_length())]
+            for bs in (
+                tqdm(bs, desc="Capturing CUDA Graphs")
+                if self.is_master and self.show_progress_bar
+                else bs
+            ):
+                for seq_len in [1024, 4096, 8192, 16384]:
+                    self.capture_cudagraph(bs, seq_len)
+
+            if not self.captured_graphs:
+                logger.warning(
+                    "No compactor CUDA graphs were captured (KV budget tight or "
+                    "allocate_sequences failed during capture). Using eager decode "
+                    "for this session."
+                )
+                self.enforce_eager = True
+
+        self.packed_args = PackedTensorArguments(
+            rank=self.rank,
+            max_batched_tokens=self.max_batched_tokens,
+            config=self.config,
+            device=self._device,
+            use_tp_group_for_collectives=embedded_in_vllm_worker,
+        )
+        # Make sure :meth:`exit` runs at interpreter shutdown.
+        atexit.register(self.exit)
+
+    @torch.inference_mode()
+    def warmup(self, num_warmup_tokens: int, *, with_kv: bool):
+        sched = (
+            self.config.attention_schedule
+            if with_kv
+            else KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
+        )
+        if self.rank == 0:
+            logger.info(
+                "Warming up compactor attention (%s KV init): schedule=%s",
+                "after" if with_kv else "before",
+                sched.name,
+            )
+        device = self._device
+        input_ids = torch.tensor(
+            [self.config.eos] * num_warmup_tokens, device=device, dtype=torch.int64
+        )
+        positions = torch.arange(num_warmup_tokens, device=device, dtype=torch.int64)
+        cu_seqlens_q = torch.tensor(
+            [0, num_warmup_tokens], device=device, dtype=torch.int32
+        )
+        cu_seqlens_k = torch.tensor(
+            [0, num_warmup_tokens], device=device, dtype=torch.int32
+        )
+        if with_kv:
+            success, batch_mapping = self.kv_manager.allocate_sequences(
+                [-1], [num_warmup_tokens]
+            )
+            assert success
+            max_bh_len = int(
+                self.kv_manager.paged_cache.bh_seq_lens.index_select(
+                    1, index=batch_mapping
+                )
+                .max()
+                .item()
+            )
+        else:
+            batch_mapping = None
+            max_bh_len = 0
+        set_context(
+            is_prefill=True,
+            do_compression=False,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            cu_seqlens_q_host=(0, num_warmup_tokens),
+            cu_seqlens_k_host=(0, num_warmup_tokens),
+            max_seqlen_q=num_warmup_tokens,
+            max_seqlen_k=num_warmup_tokens,
+            batch_mapping=batch_mapping,
+            max_bh_len=max_bh_len,
+            attention_schedule=sched,
+        )
+        for _ in range(2):
+            torch.cuda.reset_peak_memory_stats()
+            h = self.model(input_ids, positions)
+            self.model.compute_logits(h)
+            barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
+            if with_kv:
+                self.kv_manager.paged_cache.bh_seq_lens.index_fill_(
+                    1, batch_mapping.to(torch.long), 0
+                )
+        reset_context()
+        if with_kv:
+            self.kv_manager.free_sequences([-1])
+
+    def exit(self):
+        if getattr(self, "_exited", False):
+            return
+        self._exited = True
+        try:
+            if hasattr(self, "captured_graphs"):
+                self.captured_graphs.clear()
+        finally:
+            if getattr(self, "embedded_in_vllm_worker", False):
+                return
+            if dist.is_initialized():
+                dist.destroy_process_group()
+
+    def loop(self):
+        while True:
+            if self.batch_ready.wait(1.0):
+                self._process_batches_peer()
+
+    @torch.inference_mode()
+    def run_prefill(
+        self, prefill_args: PrefillBatchArguments, batch_mapping: torch.Tensor
+    ):
+        assert prefill_args.B > 0 and prefill_args.N > 0
+        max_bh_len = (
+            self.kv_manager.paged_cache.bh_seq_lens.index_select(1, index=batch_mapping)
+            .max()
+            .item()
+        )
+        compression_context = CompressionContext(
+            compression_method=prefill_args.compression_method,
+            compression_chunk_size=prefill_args.compression_chunk_size,
+            batch_tokens_to_retain=prefill_args.batch_tokens_to_retain,
+            max_tokens_to_retain=prefill_args.max_tokens_to_retain,
+            context_lens=prefill_args.context_lens.tolist(),
+            PHI=prefill_args.PHI,
+            sketch_dimension=self.leverage_sketch_size,
+            protected_first_tokens=prefill_args.protected_first,
+            protected_last_tokens=prefill_args.protected_last,
+            compression_ratio=prefill_args.compression_ratio,
+        )
+        cu_q_host = tuple(
+            int(x) for x in prefill_args.cu_seqlens_q.detach().cpu().view(-1).tolist()
+        )
+        cu_k_host = tuple(
+            int(x) for x in prefill_args.cu_seqlens_k.detach().cpu().view(-1).tolist()
+        )
+        set_context(
+            is_prefill=True,
+            do_compression=prefill_args.do_compression,
+            cu_seqlens_q=prefill_args.cu_seqlens_q,
+            cu_seqlens_k=prefill_args.cu_seqlens_k,
+            cu_seqlens_q_host=cu_q_host,
+            cu_seqlens_k_host=cu_k_host,
+            max_seqlen_q=prefill_args.max_seqlen_q,
+            max_seqlen_k=prefill_args.max_seqlen_k,
+            batch_mapping=batch_mapping,
+            max_bh_len=max_bh_len,
+            compression_context=compression_context,
+            STORE_STREAM=self.store_stream,
+            attention_schedule=self.config.attention_schedule,
+        )
+        # int32 token ids break vLLM-delegated embedding (expects long indices) on some paths.
+        _iid = (
+            prefill_args.input_ids
+            if prefill_args.input_ids.dtype == torch.int64
+            else prefill_args.input_ids.long()
+        )
+        _pos = (
+            prefill_args.positions
+            if prefill_args.positions.dtype == torch.int64
+            else prefill_args.positions.long()
+        )
+        hidden = self.model(_iid, _pos)
+        logits = self.model.compute_logits(hidden)
+        reset_context()
+        return logits
+
+    def maybe_broadcast(self, tensor: torch.Tensor, *, label: str = "tensor") -> None:
+        if self.world_size > 1:
+            broadcast_from_tp_rank0(
+                tensor, use_tp_group=self.embedded_in_vllm_worker
+            )
+        return None
+
+    def maybe_release_peers(self, do_release=False):
+        if self.world_size <= 1:
+            return
+        if self.embedded_in_vllm_worker:
+            flag = torch.zeros(1, dtype=torch.int32, device=self._device)
+            if self.is_master:
+                flag[0] = 0 if do_release else 1
+            broadcast_from_tp_rank0(flag, use_tp_group=True)
+            if not self.is_master:
+                self._embedded_peer_continue = bool(flag[0].item())
+            barrier_sync(use_tp_group=True)
+            return
+        if self.is_master:
+            if do_release:
+                for event in self.peer_events:
+                    event.clear()
+            barrier_sync(use_tp_group=False)
+        else:
+            barrier_sync(use_tp_group=False)
+
+    def _peer_outer_loop_active(self) -> bool:
+        if self.batch_ready is not None:
+            return self.batch_ready.is_set()
+        if self.embedded_in_vllm_worker:
+            return self._embedded_peer_continue
+        return False
+
+    @torch.inference_mode()
+    def generate(
+        self,
+        all_sequences: List[Sequence],
+        batch_compression_params: Optional[BatchCompressionParams] = None,
+    ):
+        assert self.is_master, "generate can only be called on the master process"
+        if not self.embedded_in_vllm_worker:
+            for begin_execution_event in self.peer_events:
+                begin_execution_event.set()
+        if batch_compression_params is None:
+            batch_compression_params = BatchCompressionParams()
+        self._process_batches_master(all_sequences, batch_compression_params)
+
+    @property
+    def is_master(self):
+        return self.rank == 0
+
+    @torch.inference_mode()
+    def _process_batches_master(
+        self,
+        all_sequences: List[Sequence],
+        batch_compression_params: BatchCompressionParams,
+    ):
+        """Master-rank scheduling loop: interleave prefill batches and decode.
+
+        Each iteration asks the scheduler for a prefill batch. If one is
+        available it is allocated, prefilled, sampled and merged into the
+        running decode batch; decode then runs only once the scheduler cannot
+        prefill another batch. Sequences that left the decode batch are
+        recorded as finished and their KV pages are freed.
+
+        Args:
+            all_sequences: Sequences to generate for; they are updated through
+                the scheduler rather than returned.
+            batch_compression_params: Batch-level compression settings passed
+                to ``build_prefill_args``.
+
+        Raises:
+            RuntimeError: If no prefill batch can be formed while sequences
+                are still pending.
+        """
+        assert self.is_master
+        compression_details = f"Applying Compression Method: {batch_compression_params.compression_method}"
+        # Only log the method when at least one sequence actually compresses.
+        if any(seq.compression_params.compression_ratio < 1.0 for seq in all_sequences):
+            logger.info(compression_details)
+        scheduler = Scheduler(
+            all_sequences=all_sequences,
+            kv_manager=self.kv_manager,
+            use_tqdm=self.show_progress_bar,
+        )
+        decode_batch = DecodeBatchArguments()
+        # Two int32 slots broadcast to peers: [run_decode, occupancy].
+        decode_flags = torch.empty(2, dtype=torch.int32, device=self._device)
+        while not scheduler.is_finished():
+            sequences = scheduler.get_prefill_batch()
+            if not sequences:
+                if scheduler.pending_sequence_ids:
+                    raise RuntimeError(
+                        "KV-prune compactor cannot schedule any prefill (KV/token budget). "
+                        f"max_batched_tokens={self.kv_manager.max_batched_tokens}, "
+                        f"pending_sequences={len(scheduler.pending_sequence_ids)}. "
+                        "Lower v1 gpu_memory_utilization / max_model_len, set "
+                        "VLLM_KVPRUNE_RELEASE_V1_KV=1 to discard v1 KV (sleep+wake), "
+                        "or free GPU memory. Diagnostics: "
+                        f"{scheduler.diagnose_prefill_failure()}"
+                    )
+                # Pending is empty: either finished or decode-only continuation.
+                if decode_batch.token_ids is None:
+                    break
+                run_decode = True
+                occupancy = -1
+                # NOTE(review): ``decode_batch.update`` is not called on this
+                # path, so the local ``occupancy`` only reaches peers through
+                # ``decode_flags`` -- confirm this is intended.
+            else:
+                seq_ids_cpu = [seq.seq_id for seq in sequences]
+                scheduler.add_running_sequence_ids(seq_ids_cpu, update_status=True)
+                temps = torch.tensor(
+                    [s.sampling_params.temperature for s in sequences],
+                    dtype=torch.float32,
+                    pin_memory=True,
+                ).to(device=self._device, non_blocking=True)
+                prefill_arguments = self.packed_args.build_prefill_args(
+                    sequences, batch_compression_params=batch_compression_params
+                )
+                # Per-sequence upper bound on context length: prompt length
+                # plus the maximum number of new tokens.
+                max_ctx_lens = (
+                    prefill_arguments.max_new_tokens + prefill_arguments.context_lens
+                )
+
+                success, batch_mapping = self.kv_manager.allocate_sequences(
+                    seq_ids_cpu, max_ctx_lens.tolist()
+                )
+                assert success, "failed to allocate pages for sequences"
+
+                logits = self.run_prefill(prefill_arguments, batch_mapping)
+                # Must match prefill `positions` dtype (int64). `context_lens` is int32
+                # from the packed buffer; using int32 here breaks RoPE indexing
+                # (`cos_sin_cache[positions]`) on CUDA for decode vs prefill.
+                positions = prefill_arguments.context_lens.to(dtype=torch.int64)
+                token_ids = self.sampler(logits, temps)
+                # Prefill KV writes + bh_seq_lens updates run on STORE_STREAM; reclaim
+                # reads bh_seq_lens on the default stream and must not race.
+                if self.store_stream is not None:
+                    torch.cuda.default_stream().wait_stream(self.store_stream)
+                # TODO: synchronize page counts across dist
+                if self.world_size == 1:
+                    self.kv_manager.reclaim_pages(
+                        seq_ids_cpu, prefill_arguments.max_new_tokens
+                    )
+                    # with logging_redirect_tqdm():
+                    #     logger.info(
+                    #         f"Reclaimed {reclaimed_bytes / 1e6:.2f} MB from the KV cache"
+                    #     )
+
+                if scheduler.any_pending_sequences():
+                    # Work is still queued: target 66% of the combined batch
+                    # (already-running rows plus the rows just prefilled).
+                    # Presumably stored by ``update`` as the occupancy at which
+                    # ``run_decode_loop`` returns -- verify in
+                    # ``DecodeBatchArguments.update``.
+                    num_pending_batches = (
+                        0
+                        if decode_batch.token_ids is None
+                        else decode_batch.token_ids.shape[0]
+                    )
+                    occupancy = int((num_pending_batches + len(seq_ids_cpu)) * 0.66)
+                else:
+                    occupancy = -1
+                # Keep prefilling while the scheduler can still fit a batch.
+                run_decode = not scheduler.can_prefill_another_batch()
+                decode_batch = decode_batch.update(
+                    batch_mapping,
+                    token_ids,
+                    positions,
+                    max_ctx_lens,
+                    prefill_arguments.seq_ids,
+                    temps,
+                    occupancy,
+                )
+            if self.world_size > 1:
+                # Peers read these flags in ``_process_batches_peer``.
+                decode_flags[0] = int(run_decode)
+                decode_flags[1] = occupancy
+                self.maybe_broadcast(decode_flags, label="decode_flags")
+            if not run_decode:
+                continue
+            if self.store_stream is not None:
+                torch.cuda.default_stream().wait_stream(self.store_stream)
+
+            decode_output, decode_batch = self.run_decode_loop(decode_batch)
+            # Anything active that is no longer in the decode batch has finished.
+            finished_sequence_ids = scheduler.get_finished_sequence_ids_from_unfinished(
+                decode_batch.seq_ids.tolist()
+            )
+            scheduler.record_finished_sequence_ids(
+                finished_sequence_ids, update_status=True
+            )
+            self.kv_manager.free_sequences(finished_sequence_ids)
+            self.maybe_release_peers(scheduler.is_finished())
+            scheduler.update_sequences(
+                decode_output.output_tokens.tolist(),
+                decode_output.output_seq_ids.tolist(),
+            )
+        scheduler.close()
+
+    @torch.inference_mode()
+    def run_peer_session(self) -> None:
+        """Non-master TP ranks: run one peer session (used when embedded in vLLM)."""
+        if self.embedded_in_vllm_worker:
+            self._embedded_peer_continue = True
+        self._process_batches_peer()
+
+    @torch.inference_mode()
+    def _process_batches_peer(self):
+        """Non-master loop that mirrors the master's prefill/decode schedule.
+
+        Runs while ``_peer_outer_loop_active()`` is true. Each iteration
+        obtains prefill arguments, allocates KV for them, runs the prefill
+        forward pass, reads the master's ``decode_flags`` and, when told to,
+        joins the decode loop. Peers never sample: the logits returned by
+        ``run_prefill`` are discarded here.
+        """
+        assert not self.is_master
+        # Peers track only sequence ids; no Sequence objects are registered.
+        scheduler = Scheduler([], kv_manager=self.kv_manager)
+        decode_batch = DecodeBatchArguments()
+        decode_flags = torch.empty(2, dtype=torch.int32, device=self._device)
+        while self._peer_outer_loop_active():
+            # Called without sequences -- presumably this receives the batch
+            # packed by the master; confirm in ``packed_args``.
+            prefill_arguments = self.packed_args.build_prefill_args()
+
+            B = prefill_arguments.B
+            max_ctx_lens = (
+                prefill_arguments.max_new_tokens + prefill_arguments.context_lens
+            )
+
+            seq_ids_cpu = prefill_arguments.seq_ids.tolist()
+            scheduler.add_running_sequence_ids(seq_ids_cpu)
+            success, batch_mapping = self.kv_manager.allocate_sequences(
+                seq_ids_cpu, max_ctx_lens.tolist()
+            )
+            assert success, "failed to allocate pages for sequences"
+
+            self.run_prefill(prefill_arguments, batch_mapping)
+            # int64 to match the dtype used for positions on the master.
+            positions = prefill_arguments.context_lens.to(dtype=torch.int64)
+            # Receives [run_decode, occupancy] as written by the master.
+            self.maybe_broadcast(decode_flags, label="decode_flags")
+            run_decode = bool(decode_flags[0].item())
+            occupancy = int(decode_flags[1].item())
+            # Uninitialized placeholder: ``run_decode_loop`` starts each step
+            # by broadcasting ``decode_batch.token_ids``.
+            token_ids = torch.empty(B, dtype=torch.int64, device=self._device)
+            decode_batch = decode_batch.update(
+                batch_mapping,
+                token_ids,
+                positions,
+                max_ctx_lens,
+                prefill_arguments.seq_ids,
+                None,  # temps not used in peer process
+                occupancy,
+            )
+
+            if not run_decode:
+                continue
+            if self.store_stream is not None:
+                torch.cuda.default_stream().wait_stream(self.store_stream)
+
+            _, decode_batch = self.run_decode_loop(decode_batch)
+            finished_sequence_ids = scheduler.get_finished_sequence_ids_from_unfinished(
+                decode_batch.seq_ids.tolist()
+            )
+            scheduler.record_finished_sequence_ids(finished_sequence_ids)
+            self.kv_manager.free_sequences(finished_sequence_ids)
+            # May update the flag that ``_peer_outer_loop_active`` reads.
+            self.maybe_release_peers()
+        scheduler.close()
+
+    @torch.inference_mode()
+    def run_decode_loop(
+        self,
+        decode_batch: DecodeBatchArguments,
+    ) -> tuple[DecodeBatchOutput, DecodeBatchArguments]:
+        if self.is_master:
+            num_stashed_batches = decode_batch.num_stashed_batches
+            tok_buffer = [
+                decode_batch.token_ids[num_stashed_batches:].to(
+                    "cpu", non_blocking=True
+                )
+            ]
+            seq_buffer = [
+                decode_batch.seq_ids[num_stashed_batches:].to("cpu", non_blocking=True)
+            ]
+        while True:
+            self.maybe_broadcast(decode_batch.token_ids, label="decode_token_ids")
+            not_stopped = ~torch.isin(decode_batch.token_ids, self._stop_token_ids)
+            running_batches = (decode_batch.positions < decode_batch.max_ctx_lens) & (
+                not_stopped
+            )
+            decode_batch.token_ids = torch.masked_select(
+                decode_batch.token_ids, running_batches
+            )
+            decode_batch.positions = torch.masked_select(
+                decode_batch.positions, running_batches
+            )
+            decode_batch.batch_mapping = torch.masked_select(
+                decode_batch.batch_mapping, running_batches
+            )
+            decode_batch.max_ctx_lens = torch.masked_select(
+                decode_batch.max_ctx_lens, running_batches
+            )
+            decode_batch.seq_ids = torch.masked_select(
+                decode_batch.seq_ids, running_batches
+            )
+            if self.is_master:
+                decode_batch.temps = torch.masked_select(
+                    decode_batch.temps, running_batches
+                )
+            num_remaining = decode_batch.token_ids.numel()
+            if (
+                num_remaining == 0
+                or num_remaining <= decode_batch.desired_batch_occupancy
+            ):
+                decode_batch.num_stashed_batches = num_remaining
+                break
+            logits = self._decode_step_logits(decode_batch)
+
+            if self.is_master:
+                decode_batch.token_ids = self.sampler(logits, decode_batch.temps)
+                tok_buffer.append(decode_batch.token_ids.to("cpu", non_blocking=True))
+                seq_buffer.append(decode_batch.seq_ids.to("cpu", non_blocking=True))
+            decode_batch.positions += 1
+
+        if self.is_master:
+            # non_blocking D2H copies must finish before cat/tolist read CPU data.
+            torch.cuda.synchronize()
+            output = DecodeBatchOutput(
+                output_tokens=torch.cat(tok_buffer),
+                output_seq_ids=torch.cat(seq_buffer),
+            )
+        else:
+            output = DecodeBatchOutput(None, None)
+        return output, decode_batch
+
+    def _decode_logits_eager(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        batch_mapping: torch.Tensor,
+    ):
+        set_context(
+            is_prefill=False,
+            do_compression=False,
+            batch_mapping=batch_mapping,
+            attention_schedule=self.config.attention_schedule,
+        )
+        _iid = input_ids if input_ids.dtype == torch.int64 else input_ids.long()
+        _pos = positions if positions.dtype == torch.int64 else positions.long()
+        hidden = self.model(_iid, _pos)
+        return self.model.compute_logits(hidden)
+
+    @torch.inference_mode()
+    def _decode_step_logits(self, decode_batch: DecodeBatchArguments):
+        """Graph decode when possible; otherwise eager (never raises on missing graph)."""
+        if self.enforce_eager or not self.captured_graphs:
+            return self._decode_logits_eager(
+                decode_batch.token_ids,
+                decode_batch.positions,
+                decode_batch.batch_mapping,
+            )
+        try:
+            return self.run_graph_decode(
+                decode_batch.token_ids,
+                decode_batch.positions,
+                decode_batch.batch_mapping,
+            )
+        except Exception as e:
+            logger.warning(
+                "CUDA graph decode failed (%s); switching to eager decode for "
+                "remaining steps.",
+                e,
+            )
+            self.enforce_eager = True
+            return self._decode_logits_eager(
+                decode_batch.token_ids,
+                decode_batch.positions,
+                decode_batch.batch_mapping,
+            )
+
+    @torch.inference_mode()
+    def run_graph_decode(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        batch_mapping: torch.Tensor,
+    ):
+        bs = input_ids.shape[0]
+        max_k = int(positions.max())
+        graph_dict = self.get_cuda_graph(bs, max_k)
+        if graph_dict is None:
+            return self._decode_logits_eager(input_ids, positions, batch_mapping)
+        set_context(
+            is_prefill=False,
+            do_compression=False,
+            batch_mapping=batch_mapping,
+            attention_schedule=self.config.attention_schedule,
+        )
+        graph_dict["input_ids"][:bs] = input_ids
+        graph_dict["positions"][:bs] = positions
+        graph_dict["batch_mapping"].fill_(RESERVED_BATCH)
+        graph_dict["batch_mapping"][:bs] = batch_mapping
+        graph_dict["graph"].replay()
+        logits_out = graph_dict["logits"]
+        return logits_out[:bs].contiguous()
+
+    @torch.inference_mode()
+    def capture_cudagraph(self, batch_size: int, max_seqlen_k: int):
+        """Capture a decode CUDA graph for ``batch_size`` and ``max_seqlen_k``.
+
+        Temporarily allocates ``batch_size`` placeholder sequences in the KV
+        manager, runs one warm-up forward pass, captures a second pass into a
+        ``torch.cuda.CUDAGraph``, and stores the graph with its static buffers
+        in ``self.captured_graphs[batch_size][max_seqlen_k]``. If the KV
+        allocation fails, a warning is logged and nothing is captured.
+
+        Both the success and the skip path call ``barrier_sync`` twice.
+        """
+        barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
+        device = torch.device("cuda")
+        logger.debug(
+            f"Capturing CUDA graph for batch size {batch_size} ({max_seqlen_k} tokens)"
+        )
+        # Static input buffers; ``run_graph_decode`` copies into their first rows.
+        # NOTE(review): input ids are int32 here while the eager path casts
+        # ids to int64 -- confirm the model accepts int32 ids under capture.
+        _g_input_ids = torch.zeros(batch_size, dtype=torch.int32, device=device)
+        _g_positions = torch.zeros(batch_size, dtype=torch.int64, device=device)
+        _g_hidden = None
+        key_split = num_splits_heuristic(
+            batch_size * self.kv_manager.num_kv_heads,
+            max_seq_len=max_seqlen_k,
+            num_sms=torch.cuda.get_device_properties(device).multi_processor_count,
+            max_splits=12,
+        )
+
+        # Placeholder sequences with ids 0..batch_size-1, each requested with
+        # length 256; they are freed again at the end of this method.
+        success, _g_batch_mapping = self.kv_manager.allocate_sequences(
+            list(range(batch_size)), [256] * batch_size
+        )
+        if not success:
+            # Shared GPU with vLLM: compactor KV pool is small; large batch capture
+            # often cannot reserve [256]*batch_size per sequence. Skip this graph.
+            logger.warning(
+                "Skipping CUDA graph capture for batch_size=%s max_seqlen_k=%s "
+                "(KV allocate_sequences failed; decode will use eager or other graphs).",
+                batch_size,
+                max_seqlen_k,
+            )
+            barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
+            return
+
+        set_context(
+            is_prefill=False,
+            do_compression=False,
+            batch_mapping=_g_batch_mapping,
+            key_split=key_split,
+            attention_schedule=self.config.attention_schedule,
+        )
+        # Warm-up pass outside the graph; its outputs are discarded.
+        _gw = self.model(_g_input_ids, _g_positions)
+        self.model.compute_logits(_gw)
+        barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
+        decode_graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(decode_graph):
+            _g_hidden = self.model(_g_input_ids, _g_positions)
+            _g_logits = self.model.compute_logits(_g_hidden)
+        # Keep the graph together with the tensors it reads and writes.
+        graph_vars = {
+            "graph": decode_graph,
+            "input_ids": _g_input_ids,
+            "positions": _g_positions,
+            "batch_mapping": _g_batch_mapping,
+            "hidden": _g_hidden,
+            "logits": _g_logits,
+            "key_split": key_split,
+        }
+        if batch_size not in self.captured_graphs:
+            self.captured_graphs[batch_size] = {}
+            self.min_captured_len[batch_size] = float("inf")
+
+        self.captured_graphs[batch_size][max_seqlen_k] = graph_vars
+        self.min_captured_len[batch_size] = min(
+            max_seqlen_k, self.min_captured_len[batch_size]
+        )
+        self.kv_manager.free_sequences(list(range(batch_size)))
+
+    def get_cuda_graph(
+        self, batch_size: int, max_seqlen_k: int
+    ) -> Optional[dict[str, Any]]:
+        """Return a captured graph dict, or None if no compatible capture exists."""
+        if not self.captured_graphs:
+            return None
+        eligible_bs = [x for x in self.captured_graphs.keys() if x >= batch_size]
+        if not eligible_bs:
+            return None
+        bs_key = min(eligible_bs)
+        batch_size_graphs = self.captured_graphs[bs_key]
+        candidates = [sl for sl in batch_size_graphs.keys() if sl <= max_seqlen_k]
+        if not candidates:
+            return None
+        best_sl = max(candidates)
+        return batch_size_graphs[best_sl]
+
+
--- a/vllm/kvprune/core/scheduler.py
+++ b/vllm/kvprune/core/scheduler.py
+import time
+from typing import Iterable, List
+
+from vllm.kvprune.core.memory_manager import KVCacheManager
+from vllm.kvprune.utils.sequence import Sequence, SequenceStatus
+from tqdm import tqdm
+
+
+def cdiv(a, b):
+    """ceiling division"""
+    return (a + b - 1) // b
+
+
+class Scheduler:
+    """
+    Simple sequence scheduler for prefill + decode with a paged KV cache.
+    The scheduler tracks three disjoint sets of sequence IDs:
+
+      * ``pending_sequence_ids`` 鈥?sequences that have not yet been started.
+      * ``active_sequence_ids`` 鈥?sequences currently running.
+      * ``finished_sequence_ids`` 鈥?sequences that have generated all tokens.
+
+    At prefill time, :meth:`get_prefill_batch` selects a subset of pending
+    sequences that can fit into the available KV cache and per-step token
+    budget, given the constraints from the associated :class:`KVCacheManager`.
+
+    The class also handles basic bookkeeping of sequence statuses.
+
+    Args:
+        :param all_sequences:
+            Iterable of :class:`Sequence` objects to be scheduled. Each
+            sequence must have a unique ``seq_id``.
+        :param kv_manager:
+            A :class:`KVCacheManager` instance that this scheduler will use
+            to determine whether additional batches can be scheduled.
+        :param use_tqdm:
+            If True, two progress bars are created:
+              * "Started Batches" 鈥?increments when a sequence moves from
+                pending to running.
+              * "Finished Batches" 鈥?increments when a sequence finishes.
+    """
+
+    def __init__(
+        self,
+        all_sequences: Iterable[Sequence],
+        kv_manager: KVCacheManager,
+        *,
+        use_tqdm=False,
+    ):
+        self.allseq_mapping: dict[int, Sequence] = {s.seq_id: s for s in all_sequences}
+        self.pending_sequence_ids: set[int] = set([s.seq_id for s in all_sequences])
+        self.active_sequence_ids: set[int] = set()
+        self.finished_sequence_ids: set[int] = set()
+        self.manager = kv_manager
+        self.use_tqdm = use_tqdm
+        self.start_time = time.perf_counter()
+        self.total_tokens_generated = 0
+        self.total_tokens_input = 0
+        self.pbar = None
+        if use_tqdm:
+            self.pbar = tqdm(
+                total=len(self.pending_sequence_ids),
+                desc="Completed Batches",
+            )
+
+    def get_prefill_batch(self) -> List[Sequence]:
+        """
+        Select a batch of pending sequences to prefill under KV/memory constraints.
+
+        The selection is greedy over ``pending_sequence_ids`` in iteration order.
+        A sequence is added to the batch if:
+
+          * The sum of its prompt length and the total prompt tokens selected so
+            far does not exceed ``manager.max_batched_tokens``, and
+          * There is at least one free KV "batch slot" left
+            (``manager.num_free_batches``), and
+          * The total number of KV pages required by the sequence's prompt +
+            max_new_tokens does not exceed the remaining free pages.
+        Returns:
+            :return List[Sequence]:
+                The list of :class:`Sequence` objects chosen for prefill in
+                this step. The caller is responsible for marking them as
+                active via :meth:`add_running_sequence_ids`.
+        """
+        total_tok, sequences = 0, []
+        num_free_batches, num_free_pages = (
+            self.manager.num_free_batches,
+            self.manager.num_free_pages,
+        )
+        for seq_id in self.pending_sequence_ids:
+            seq = self.allseq_mapping[seq_id]
+            prompt_length = seq.prompt_len
+            pages_needed = (
+                cdiv(
+                    prompt_length + seq.sampling_params.max_new_tokens,
+                    self.manager.page_size,
+                )
+                * self.manager.num_kv_heads
+            )
+            if (
+                prompt_length + total_tok <= self.manager.max_batched_tokens
+                and num_free_batches > 0
+                and pages_needed < num_free_pages
+            ):
+                sequences.append(seq)
+                total_tok += prompt_length
+                num_free_pages -= pages_needed
+                num_free_batches -= 1
+        return sequences
+
+    def is_finished(self) -> bool:
+        """
+        Check whether all sequences have completed.
+        """
+        return (
+            len(self.pending_sequence_ids) == 0 and len(self.active_sequence_ids) == 0
+        )
+
+    def any_pending_sequences(self) -> bool:
+        """
+        Check whether any sequences are still pending (not yet started).
+        """
+        return len(self.pending_sequence_ids) != 0
+
+    def add_running_sequence_ids(
+        self, active_sequence_ids: Iterable[int], *, update_status: bool = False
+    ):
+        """
+        Mark a set of sequences as active / running. This moves sequence IDs
+        from ``pending_sequence_ids`` into ``active_sequence_ids``. Optionally,
+        it also updates the per-sequence status and progress bar.
+
+        Args:
+            :param active_sequence_ids:
+                Iterable of sequence IDs that have been scheduled for prefill
+                or decode and should now be considered running.
+            :param update_status:
+                If True, set each corresponding :class:`Sequence`'s
+                ``status = SequenceStatus.RUNNING`` and increment the
+                "Started Batches" progress bar if ``use_tqdm`` is enabled.
+        """
+        self.active_sequence_ids.update(active_sequence_ids)
+        self.pending_sequence_ids.difference_update(self.active_sequence_ids)
+        if update_status:
+            for seq_id in active_sequence_ids:
+                self.allseq_mapping[seq_id].status = SequenceStatus.RUNNING
+                self.total_tokens_input += self.allseq_mapping[seq_id].prompt_len
+
+    def get_finished_sequence_ids_from_unfinished(
+        self, unfinished_sequence_ids: Iterable[int]
+    ) -> set[int]:
+        """
+        Infer which active sequences have finished given the
+        unfinished set (for decode steps where the caller knows
+        which sequences are still generating but not necessarily
+        which have just completed).
+        Args:
+            :param unfinished_sequence_ids:
+                Iterable of sequence IDs that are still running
+        Returns:
+            :return set[int]:
+                The inferred set of sequence IDs that transitioned from active
+                to finished.
+        """
+        return self.active_sequence_ids.difference(unfinished_sequence_ids)
+
+    def record_finished_sequence_ids(
+        self, finished_sequence_ids: Iterable[int], *, update_status: bool = False
+    ):
+        """
+        Record that a set of sequences has finished generation.
+
+        This moves IDs from ``active_sequence_ids`` into
+        ``finished_sequence_ids``.
+
+        Args:
+            :param finished_sequence_ids:
+                Iterable of sequence IDs that have completed generation and
+                no longer require KV cache.
+            :param update_status:
+                If True, set each corresponding :class:`Sequence`'s
+                ``status = SequenceStatus.FINISHED``
+        """
+        self.active_sequence_ids.difference_update(finished_sequence_ids)
+        self.finished_sequence_ids.update(finished_sequence_ids)
+        if update_status:
+            for seq_id in finished_sequence_ids:
+                self.allseq_mapping[seq_id].status = SequenceStatus.FINISHED
+                if self.pbar is not None:
+                    self.pbar.update(1)
+
+    def update_sequences(self, tokens: Iterable[int], seq_ids: Iterable[int]):
+        """
+        Append newly generated tokens to their corresponding sequences.
+        Args:
+            :param tokens:
+                Iterable of generated token IDs, one per sequence.
+            :param seq_ids:
+                Iterable of sequence IDs aligned with ``tokens``.
+        """
+        cur_time = time.perf_counter()
+        for tok, seq_id in zip(tokens, seq_ids):
+            self.allseq_mapping[seq_id].add_new_token(tok)
+            self.total_tokens_generated += 1
+        if self.pbar is not None:
+            self.pbar.set_description(
+                f"Throughput: {(self.total_tokens_generated + self.total_tokens_input) / (cur_time - self.start_time):.2f} tok/s"
+            )
+
+    def close(self):
+        if self.pbar is not None:
+            self.pbar.close()
+
+    def can_prefill_another_batch(self) -> bool:
+        return len(self.get_prefill_batch()) > 0
--- a/vllm/kvprune/integration/__init__.py
+++ b/vllm/kvprune/integration/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""KV-pruning integration: compactor ``LLMEngine`` sharing weights with :class:`~vllm.LLM`."""
+
+from vllm.kvprune.integration.compression_params import CompressionParams
+
+__all__ = ["CompressionParams"]
--- a/vllm/kvprune/integration/compactor_shared.py
+++ b/vllm/kvprune/integration/compactor_shared.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Construct compactor :class:`LLMEngine` sharing weight tensors with an in-process vLLM ``LLM``."""
+
+from __future__ import annotations
+
+import os
+
+import torch.nn as nn
+
+from vllm.config import VllmConfig
+from vllm.kvprune.config.engine_config import LLMConfig
+from vllm.kvprune.core.llm_engine import LLMEngine
+from vllm.kvprune.integration.config_adapter import vllm_config_to_llm_config
+from vllm.kvprune.integration.vllm_model_access import extract_vllm_causal_lm
+from vllm.kvprune.integration.weight_tie import (
+    delegate_kvprune_compute_logits_to_vllm,
+    delegate_kvprune_embed_tokens_to_vllm,
+    tie_kvprune_rope_buffers_from_vllm,
+    tie_kvprune_weights_from_vllm,
+)
+from vllm.kvprune.models import MODEL_REGISTRY
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def build_llm_config_for_compactor(vc: VllmConfig) -> LLMConfig:
+    """Public helper: vLLM config → compactor :class:`LLMConfig`."""
+    return vllm_config_to_llm_config(vc)
+
+
+def create_compactor_engine_with_shared_weights(llm: object) -> LLMEngine:
+    """Single GPU, TP=1: compactor ``LLMEngine`` whose weights alias vLLM tensors.
+
+    Call after the vLLM ``LLM`` has loaded weights. Requires in-process executor
+    (``VLLM_ENABLE_V1_MULTIPROCESSING=0``).
+
+    Environment variables read here:
+
+    * ``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS`` (default ``32``): upper bound applied
+      to ``cfg.max_num_seqs``; an empty or non-positive value disables the cap.
+    * ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER`` / ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH``:
+      select the initial ``cfg.enforce_eager`` value.
+    * ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH`` (default ``0``): unless truthy,
+      ``cfg.enforce_eager`` is forced to ``True`` right before the engine is built.
+
+    Args:
+        llm: vLLM entrypoint object exposing ``llm_engine.vllm_config``.
+
+    Returns:
+        A compactor ``LLMEngine`` built with ``external_model`` set to the tied
+        compactor module.
+
+    Raises:
+        RuntimeError: ``llm`` has no ``llm_engine`` attribute (or it is ``None``).
+        ValueError: ``tensor_parallel_size != 1``, the max-num-seqs environment
+            value is not an integer, the converted config has no ``hf_config``,
+            or the model type is missing from ``MODEL_REGISTRY``.
+    """
+    llm_engine = getattr(llm, "llm_engine", None)
+    if llm_engine is None:
+        raise RuntimeError("Expected ``llm.llm_engine``.")
+    vc: VllmConfig = llm_engine.vllm_config
+    if vc.parallel_config.tensor_parallel_size != 1:
+        raise ValueError(
+            "Shared-weight compactor backend requires tensor_parallel_size=1"
+        )
+
+    cfg = vllm_config_to_llm_config(vc)
+    # ``cfg.enforce_eager`` is for the compactor ``ModelRunner`` only (decode CUDA
+    # graphs), not v1. v1 graph capture is controlled solely by ``LLM(...,
+    # enforce_eager=...)`` / ``kvprune_compression=True`` on the entrypoint ``LLM``.
+    # Large vLLM max_num_seqs blows up compactor page-table GPU memory; sharing the GPU
+    # with v1 leaves little room for metadata + KV tensors. Default cap 32 so physical
+    # KV pages stay usable; set VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS=0 to disable cap,
+    # or raise (e.g. 128) if you have VRAM headroom.
+    _cap = os.environ.get("VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS", "32").strip()
+    if _cap:
+        try:
+            lim = int(_cap)
+        except ValueError as err:
+            # Name the offending variable; a bare ``int()`` error gives no hint
+            # about where the bad value came from.
+            raise ValueError(
+                "VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS must be an integer "
+                f"(got {_cap!r}); use 0 to disable the cap."
+            ) from err
+        if lim > 0:
+            cfg.max_num_seqs = min(cfg.max_num_seqs, lim)
+
+    # Compactor decode graphs (``enforce_eager=False``): honored for non-shared-weight
+    # engines. **Shared-weight** path (below) forces ``enforce_eager=True`` after
+    # delegating ``compute_logits`` to vLLM unless ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1``.
+    # Opt out of graphs for non-shared runs: ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1`` or
+    # ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0``.
+    _ce = os.environ.get("VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER", "").strip().lower()
+    if _ce in ("1", "true", "yes"):
+        cfg.enforce_eager = True
+        logger.info(
+            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 → "
+            "enforce_eager=True (skip compactor decode CUDA graphs)."
+        )
+    elif _ce in ("0", "false", "no"):
+        cfg.enforce_eager = False
+        logger.info(
+            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=0 → "
+            "enforce_eager=False (try compactor CUDA graph capture)."
+        )
+    else:
+        # ENFORCE_EAGER unset or unrecognised: fall back to the CUDA_GRAPH switch.
+        _dg = os.environ.get(
+            "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH", "1"
+        ).strip().lower()
+        if _dg in ("0", "false", "no"):
+            cfg.enforce_eager = True
+            logger.info(
+                "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 → "
+                "enforce_eager=True (skip compactor decode CUDA graphs)."
+            )
+        else:
+            cfg.enforce_eager = False
+            logger.info(
+                "KV-prune compactor: default try decode CUDA graphs; ModelRunner "
+                "falls back to eager if capture yields none. Set "
+                "VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 or "
+                "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 to skip capture."
+            )
+
+    hf = cfg.hf_config
+    if hf is None:
+        # Explicit check instead of ``assert`` so it survives ``python -O``.
+        raise ValueError(
+            "Compactor LLMConfig has no hf_config; cannot resolve model_type "
+            "for MODEL_REGISTRY."
+        )
+    model_type = hf.model_type
+    if model_type not in MODEL_REGISTRY:
+        raise ValueError(
+            f"Compactor MODEL_REGISTRY has no entry for model_type={model_type!r}; "
+            f"supported: {sorted(MODEL_REGISTRY)}"
+        )
+
+    vllm_model = extract_vllm_causal_lm(llm)
+    # Device and dtype are taken from the first parameter of the vLLM model.
+    first_param = next(vllm_model.parameters())
+    device = first_param.device
+    dtype = first_param.dtype
+
+    # Build compactor shell on CPU first. **Do not** call ``.to(device)`` before tying:
+    # that allocates a full second copy of weights on GPU; tying then frees the
+    # duplicate but peak memory can OOM on large models. Tie first so parameters
+    # alias vLLM tensors directly (no extra weight VRAM).
+    kv_model: nn.Module = MODEL_REGISTRY[model_type](hf)
+    tie_kvprune_weights_from_vllm(vllm_model, kv_model)
+    # Buffers (e.g. RoPE tables) not in ``named_parameters`` may still be on CPU.
+    kv_model.to(device=device, dtype=dtype)
+    tie_kvprune_rope_buffers_from_vllm(vllm_model, kv_model)
+    delegate_kvprune_embed_tokens_to_vllm(vllm_model, kv_model)
+    delegate_kvprune_compute_logits_to_vllm(vllm_model, kv_model)
+
+    # Compactor decode CUDA graphs capture ``model.forward`` + ``compute_logits`` in one
+    # graph. Here ``compute_logits`` is delegated to vLLM's LM head / LogitsProcessor
+    # (cublas GEMM, padded vocab, etc.). Embedding that in a nested capture commonly
+    # fails with ``CUBLAS_STATUS_EXECUTION_FAILED`` and invalidates stream capture
+    # (``cudaErrorStreamCaptureInvalidated``). Default: skip graphs for this integration.
+    _sw_graph = os.environ.get(
+        "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH", "0"
+    ).strip().lower() in ("1", "true", "yes")
+    if not _sw_graph:
+        # Overrides whatever the ENFORCE_EAGER / CUDA_GRAPH switches chose above.
+        cfg.enforce_eager = True
+        logger.info(
+            "KV-prune shared-weight compactor: enforce_eager=True (skip compactor "
+            "decode CUDA graphs; logits delegated to vLLM). Set "
+            "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1 only to attempt capture (often fails)."
+        )
+
+    return LLMEngine(cfg, external_model=kv_model)
--- a/vllm/kvprune/integration/compressed_generate.py
+++ b/vllm/kvprune/integration/compressed_generate.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""KV-pruning (compactor) path invoked from :meth:`vllm.entrypoints.llm.LLM.generate`."""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Callable, Sequence
+from pathlib import Path
+from typing import Any
+
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer
+
+from vllm.kvprune.compression.compression_config import (
+    BatchCompressionParams,
+    SequenceCompressionParams,
+)
+from vllm.kvprune.config.sampling_params import SamplingParams as CompactorSamplingParams
+from vllm.kvprune.core.compression_bridge import (
+    compression_method_id_to_enum,
+    compression_method_str_to_id,
+)
+from vllm.kvprune.core.llm_engine import LLMEngine, _infer_stop_token_ids
+from vllm.kvprune.integration.compactor_shared import create_compactor_engine_with_shared_weights
+from vllm.kvprune.integration.compression_params import CompressionParams
+from vllm.logger import init_logger
+from vllm.outputs import CompletionOutput, RequestOutput
+from vllm.sampling_params import SamplingParams
+
+logger = init_logger(__name__)
+
+_MP_ENV = "VLLM_ENABLE_V1_MULTIPROCESSING"
+_RELEASE_V1_KV_ENV = "VLLM_KVPRUNE_RELEASE_V1_KV"
+
+
+def _maybe_release_v1_kv_for_compactor(llm: Any) -> None:
+    """Run an opt-in ``sleep``/``wake_up`` cycle on ``llm`` before compactor use.
+
+    The cycle only happens when ``VLLM_KVPRUNE_RELEASE_V1_KV`` is ``1``, ``true``
+    or ``yes`` (case-insensitive, surrounding whitespace ignored); otherwise this
+    is a no-op. The intent is to discard v1's KV cache so the shared-weight
+    compactor, which competes for the same VRAM, has more free GPU memory:
+    ``sleep(level=1)`` discards v1 KV (and may offload tagged weights per v1
+    sleep policy), ``wake_up()`` reloads, and the compactor ties the same v1
+    tensors afterwards.
+
+    **Default:** ``vllm.env_override`` sets ``VLLM_KVPRUNE_RELEASE_V1_KV=0`` (no
+    sleep/wake; v1 KV stays on GPU). With ``=1``, ``llm.sleep`` /
+    ``CuMemAllocator`` / ``Sleep mode freed …`` logs are expected. This does
+    **not** remove v1's KV reservation at init; it only runs the optional
+    sleep/wake cycle before compactor.
+
+    Tests keep ``VLLM_KVPRUNE_RELEASE_V1_KV=0`` in ``conftest``.
+
+    Failures of either call are logged as a warning and swallowed.
+    """
+    flag = os.environ.get(_RELEASE_V1_KV_ENV, "0").strip().lower()
+    enabled = flag in ("1", "true", "yes")
+    if not enabled:
+        return
+    try:
+        logger.info(
+            "%s=1: discarding v1 KV via sleep(level=1) then wake_up() "
+            "(reloads model weights to GPU).",
+            _RELEASE_V1_KV_ENV,
+        )
+        llm.sleep(level=1, mode="abort")
+        llm.wake_up()
+    except Exception as exc:
+        # Best-effort: a failed cycle must not abort the compressed generate call.
+        logger.warning("%s: sleep/wake failed: %s", _RELEASE_V1_KV_ENV, exc)
+
+
+def ensure_inprocess_engine_for_weight_sharing() -> None:
+    """Pin ``VLLM_ENABLE_V1_MULTIPROCESSING`` to ``"0"`` in ``os.environ``.
+
+    Compactor must see ``worker.get_model()`` in the same process as vLLM. When
+    the variable is already ``"0"`` nothing is changed or logged.
+    """
+    already_inprocess = os.environ.get(_MP_ENV, "1") == "0"
+    if already_inprocess:
+        return
+    os.environ[_MP_ENV] = "0"
+    logger.info(
+        "KV cache pruning: set %s=0 so the model stays in-process for "
+        "shared-weight compactor (no manual env needed).",
+        _MP_ENV,
+    )
+
+
+def _normalize_prompt_list(prompts: Any) -> list[Any]:
+    """Return ``prompts`` as a list containing one entry per prompt.
+
+    A single prompt is wrapped in a one-element list. Single prompts are a
+    ``str``, a ``dict``, or a non-empty ``list`` made up solely of ``int`` token
+    ids (the same shape ``_prompt_to_compactor_input`` accepts as one prompt).
+    Any other iterable is treated as a batch and copied into a new list.
+
+    Without the token-id case, ``[1, 2, 3]`` would be split into three integer
+    "prompts", each of which is later rejected as an unsupported prompt type.
+    """
+    if isinstance(prompts, (str, dict)):
+        return [prompts]
+    if (
+        isinstance(prompts, list)
+        and prompts
+        and all(isinstance(t, int) for t in prompts)
+    ):
+        # One decoder-only token-id prompt, not a batch of integers.
+        return [prompts]
+    return list(prompts)
+
+
+def _normalize_sampling_params(
+    sampling_params: SamplingParams | Sequence[SamplingParams] | None,
+    n: int,
+) -> list[SamplingParams]:
+    """Expand ``sampling_params`` into a list with one entry per prompt.
+
+    ``None`` yields ``n`` freshly constructed default ``SamplingParams``; a single
+    instance is repeated ``n`` times (the same object each time); a sequence is
+    copied and must contain exactly ``n`` items.
+
+    Raises:
+        ValueError: a sequence was given whose length differs from ``n``.
+    """
+    if sampling_params is None:
+        return [SamplingParams() for _ in range(n)]
+    if isinstance(sampling_params, SamplingParams):
+        return [sampling_params for _ in range(n)]
+    expanded = list(sampling_params)
+    if len(expanded) == n:
+        return expanded
+    raise ValueError(
+        f"sampling_params length {len(expanded)} != prompts length {n}"
+    )
+
+
+def _normalize_compression_params(
+    compression: CompressionParams | Sequence[CompressionParams] | None,
+    n: int,
+) -> list[CompressionParams]:
+    """Expand ``compression`` into a list with one entry per prompt.
+
+    ``None`` yields ``n`` new ``CompressionParams(compression_ratio=1.0)``; a
+    single instance is repeated ``n`` times (the same object each time); a
+    sequence is copied and must contain exactly ``n`` items.
+
+    Raises:
+        ValueError: a sequence was given whose length differs from ``n``.
+    """
+    if compression is None:
+        return [CompressionParams(compression_ratio=1.0) for _ in range(n)]
+    if isinstance(compression, CompressionParams):
+        return [compression for _ in range(n)]
+    expanded = list(compression)
+    if len(expanded) == n:
+        return expanded
+    raise ValueError(f"compression length {len(expanded)} != prompts length {n}")
+
+
+def _any_compactor(comps: list[CompressionParams]) -> bool:
+    """Return ``True`` if at least one entry has ``compression_ratio`` below 1.0."""
+    for params in comps:
+        if params.compression_ratio < 1.0:
+            return True
+    return False
+
+
+_FORCE_COMPACTOR_PATH_ENV = "VLLM_KVPRUNE_FORCE_COMPACTOR_PATH"
+
+
+def _should_use_kvprune_compactor_path(comps: list[CompressionParams]) -> bool:
+    """Decide whether the integrated compactor path should handle this request.
+
+    The answer is ``True`` when any prompt asks for compression
+    (``compression_ratio < 1.0``) or when ``VLLM_KVPRUNE_FORCE_COMPACTOR_PATH``
+    is ``1`` / ``true`` / ``yes`` (case-insensitive, whitespace ignored).
+
+    If all ``compression_ratio >= 1.0``, the default is to return ``None`` from
+    :func:`try_compressed_generate` and fall back to the standard v1 engine
+    (``Processed prompts`` loop). That hides TP/kvprune bugs behind a different
+    code path; forcing the compactor path runs the same compactor + collective
+    RPC path as compression-on, with no KV pruning.
+    """
+    use_compactor = _any_compactor(comps)
+    if not use_compactor:
+        # The env override is only consulted when no prompt requests compression.
+        force_flag = os.environ.get(_FORCE_COMPACTOR_PATH_ENV, "").strip().lower()
+        use_compactor = force_flag in ("1", "true", "yes")
+    return use_compactor
+
+
+def _to_compactor_sampling(sp: SamplingParams) -> CompactorSamplingParams:
+    """Convert vLLM ``SamplingParams`` into compactor sampling params.
+
+    Only ``temperature`` and ``max_tokens`` are carried over; ``max_tokens`` of
+    ``None`` becomes ``max_new_tokens=16``.
+    """
+    requested = sp.max_tokens
+    max_new = 16 if requested is None else requested
+    temperature = float(sp.temperature)
+    return CompactorSamplingParams(
+        temperature=temperature,
+        max_new_tokens=int(max_new),
+    )
+
+
+def _to_sequence_compression(cp: CompressionParams) -> SequenceCompressionParams:
+    """Copy the per-sequence fields of ``cp`` into ``SequenceCompressionParams``.
+
+    The ratio is coerced to ``float`` and both protected-token counts to ``int``.
+    """
+    ratio = float(cp.compression_ratio)
+    keep_first = int(cp.protected_first_tokens)
+    keep_last = int(cp.protected_last_tokens)
+    return SequenceCompressionParams(
+        compression_ratio=ratio,
+        protected_first_tokens=keep_first,
+        protected_last_tokens=keep_last,
+    )
+
+
+def _batch_compression_from_comps(comps: list[CompressionParams]) -> BatchCompressionParams:
+    """Build batch-level compression params from the first compressing entry.
+
+    The first element with ``compression_ratio < 1.0`` decides the batch's
+    ``compression_method``; with no such element a default
+    ``BatchCompressionParams()`` is returned.
+    """
+    first_compressing = next(
+        (params for params in comps if params.compression_ratio < 1.0),
+        None,
+    )
+    if first_compressing is None:
+        return BatchCompressionParams()
+    method_id = compression_method_str_to_id(first_compressing.compression_method)
+    method = compression_method_id_to_enum(method_id)
+    return BatchCompressionParams(compression_method=method)
+
+
+def _kvprune_compactor_hf_tokenizer(llm: Any):
+    """HF tokenizer matching :meth:`vllm.kvprune.core.llm_engine.LLMEngine.__init__`.
+
+    Loads from the **resolved on-disk** model tree (local dir or HF cache snapshot), not
+    the bare repo id, to avoid redundant Hub downloads. The tokenizer is cached on
+    ``llm._kvprune_compactor_hf_tokenizer`` and returned directly on later calls.
+
+    ``trust_remote_code`` follows the vLLM model config (``model_config``) when it
+    defines the flag; the ``hf_config`` attribute of the same name is still
+    honoured as a fallback.
+
+    Args:
+        llm: object exposing ``llm_engine.vllm_config.model_config``.
+
+    Returns:
+        The (possibly cached) tokenizer from ``AutoTokenizer.from_pretrained``.
+    """
+    cached = getattr(llm, "_kvprune_compactor_hf_tokenizer", None)
+    if cached is not None:
+        return cached
+    mc = llm.llm_engine.vllm_config.model_config
+    model_s = str(mc.model)
+    src = model_s
+    try:
+        p = Path(model_s)
+        if p.is_dir() and (p / "config.json").is_file():
+            src = str(p.resolve())
+        else:
+            from huggingface_hub import snapshot_download
+
+            src = snapshot_download(repo_id=model_s, local_files_only=False)
+    except Exception:
+        # Best-effort resolution: fall back to the raw model string and let
+        # ``from_pretrained`` resolve it, but leave a trace for debugging.
+        logger.debug(
+            "Could not resolve on-disk model tree for %r; using it as-is.",
+            model_s,
+            exc_info=True,
+        )
+        src = model_s
+    hf_cfg = mc.hf_config
+    # The user's trust_remote_code choice is expected on the vLLM model config;
+    # the HF config object usually does not carry it, so check both.
+    _trust = bool(getattr(mc, "trust_remote_code", False)) or bool(
+        getattr(hf_cfg, "trust_remote_code", False)
+    )
+    tok = AutoTokenizer.from_pretrained(src, use_fast=True, trust_remote_code=_trust)
+    llm._kvprune_compactor_hf_tokenizer = tok
+    return tok
+
+
+def _prompt_to_compactor_input(prompt: Any) -> str | list[int]:
+    """Reduce one prompt to the text or token-id list the compactor consumes.
+
+    Accepted shapes:
+
+    * ``str``: returned unchanged.
+    * non-empty ``list`` of ``int``: returned as a shallow copy.
+    * ``dict`` with ``"prompt_token_ids"``: the value itself when it is a list,
+      otherwise ``list(value)``.
+    * ``dict`` whose ``"prompt"`` value is a ``str``: that string.
+
+    Raises:
+        TypeError: for an empty list or any other shape.
+    """
+    if isinstance(prompt, str):
+        return prompt
+    # Decoder-only `list[int]` token ids (see `vllm.inputs.PromptType`).
+    if isinstance(prompt, list):
+        if not prompt:
+            raise TypeError("Empty token-id prompt is not supported for compactor path.")
+        for token in prompt:
+            if not isinstance(token, int):
+                break
+        else:
+            return list(prompt)
+    elif isinstance(prompt, dict):
+        if "prompt_token_ids" in prompt:
+            token_ids = prompt["prompt_token_ids"]
+            if isinstance(token_ids, list):
+                return token_ids
+            return list(token_ids)
+        text = prompt.get("prompt")
+        if isinstance(text, str):
+            return text
+    raise TypeError(
+        f"Unsupported prompt type for compactor path: {type(prompt)}. "
+        "Use str, list[int] token ids, or dict with 'prompt_token_ids' or 'prompt'."
+    )
+
+
+def _prompt_to_token_ids_for_tp(llm: Any, prompt: Any) -> list[int]:
+    """Driver-side token ids for the TP collective path (same tokenizer as vLLM ``LLM``).
+
+    Text prompts are encoded with ``llm.get_tokenizer()``; prompts that already
+    resolve to token ids are returned as a new list.
+    """
+    compactor_input = _prompt_to_compactor_input(prompt)
+    if not isinstance(compactor_input, str):
+        return list(compactor_input)
+    tokenizer = llm.get_tokenizer()
+    return tokenizer.encode(compactor_input)
+
+
+def _compressed_generate_tp_collective(
+    llm: Any,
+    plist: list[Any],
+    sps: list[SamplingParams],
+    comps: list[CompressionParams],
+) -> list[RequestOutput]:
+    """TP>1: run compactor on each worker via ``collective_rpc`` (all ranks).
+
+    Prompts are tokenized on the driver, checked against ``max_model_len``, and
+    shipped together with plain-dict sampling/compression settings to the
+    ``kvprune_v1_compressed_generate`` worker RPC. The reply reporting
+    ``tensor_parallel_rank == 0`` is converted into ``RequestOutput`` objects.
+
+    Raises:
+        NotImplementedError: pipeline or data parallel size is not 1.
+        ValueError: a prompt is longer than ``max_model_len``.
+        RuntimeError: the RPC was cancelled, or no rank-0 dict came back.
+    """
+    vllm_cfg = llm.llm_engine.vllm_config
+    parallel = vllm_cfg.parallel_config
+    if parallel.pipeline_parallel_size != 1 or parallel.data_parallel_size != 1:
+        raise NotImplementedError(
+            "KV-prune TP compression requires pipeline_parallel_size=1 and "
+            f"data_parallel_size=1 (got PP={parallel.pipeline_parallel_size}, "
+            f"DP={parallel.data_parallel_size})."
+        )
+
+    hf_config = vllm_cfg.model_config.hf_config
+    tokenizer = llm.get_tokenizer()
+    eos_token_ids = _infer_stop_token_ids(tokenizer, hf_config)
+
+    prompt_token_ids = [_prompt_to_token_ids_for_tp(llm, prompt) for prompt in plist]
+
+    max_len = int(vllm_cfg.model_config.max_model_len)
+    for idx, token_ids in enumerate(prompt_token_ids):
+        if len(token_ids) > max_len:
+            raise ValueError(
+                f"KV-prune TP compressed generate: prompt {idx} length {len(token_ids)} "
+                f"exceeds max_model_len ({max_len}). Shorten the prompt or raise "
+                "max_model_len when constructing LLM()."
+            )
+
+    sampling_payload = [
+        {
+            "temperature": float(sp.temperature),
+            "max_new_tokens": int(sp.max_tokens if sp.max_tokens is not None else 16),
+        }
+        for sp in sps
+    ]
+    compression_payload = [
+        {
+            "compression_ratio": float(comp.compression_ratio),
+            "compression_method": str(comp.compression_method),
+            "protected_first_tokens": int(comp.protected_first_tokens),
+            "protected_last_tokens": int(comp.protected_last_tokens),
+        }
+        for comp in comps
+    ]
+    # Payload must be picklable for multiproc/Ray RPC: do not pass multiprocessing
+    # synchronization primitives (workers are separate processes).
+    payload: dict[str, Any] = {
+        "eos_token_ids": eos_token_ids,
+        "prompt_token_ids": prompt_token_ids,
+        "sampling_params": sampling_payload,
+        "compression_params": compression_payload,
+    }
+
+    _maybe_release_v1_kv_for_compactor(llm)
+    try:
+        results = llm.llm_engine.collective_rpc(
+            "kvprune_v1_compressed_generate",
+            args=(payload,),
+        )
+    except RuntimeError as err:
+        if "cancelled" not in str(err).lower():
+            raise
+        raise RuntimeError(
+            "collective_rpc was cancelled (a GPU worker likely crashed). "
+            "Scroll up for the first worker traceback — often NCCL/CUDA before "
+            "TCPStore/Broken pipe on the driver."
+        ) from err
+
+    # Only the reply that identifies itself as tensor-parallel rank 0 is used.
+    master: dict[str, Any] | None = next(
+        (
+            reply
+            for reply in results
+            if isinstance(reply, dict) and reply.get("tensor_parallel_rank") == 0
+        ),
+        None,
+    )
+    if master is None:
+        raise RuntimeError(
+            "collective_rpc did not return a dict from tensor parallel rank 0."
+        )
+    return _tp_payload_to_request_outputs(llm, master)
+
+
+def _tp_payload_to_request_outputs(llm: Any, master: dict[str, Any]) -> list[RequestOutput]:
+    """Turn the rank-0 RPC reply into one finished ``RequestOutput`` per prompt.
+
+    ``master["prompt_token_ids"]`` and ``master["completion_token_ids"]`` are
+    paired positionally; completions are decoded with ``llm.get_tokenizer()``.
+    Every output is reported with ``finish_reason="stop"`` and request id
+    ``kvprune-tp-<index>``.
+    """
+    tokenizer = llm.get_tokenizer()
+    prompt_ids_per_request = master["prompt_token_ids"]
+    completion_ids_per_request = master["completion_token_ids"]
+    request_outputs: list[RequestOutput] = []
+    pairs = zip(prompt_ids_per_request, completion_ids_per_request)
+    for idx, (prompt_ids, completion_ids) in enumerate(pairs):
+        text = tokenizer.decode(completion_ids, skip_special_tokens=True)
+        # Match ``_sequences_to_request_outputs``: if decode is only special tokens,
+        # skip_special_tokens=True yields blank text while token list is non-empty.
+        if not text.strip():
+            if completion_ids:
+                text = tokenizer.decode(completion_ids, skip_special_tokens=False)
+        completion = CompletionOutput(
+            index=0,
+            text=text,
+            token_ids=list(completion_ids),
+            cumulative_logprob=None,
+            logprobs=None,
+            finish_reason="stop",
+        )
+        request_outputs.append(
+            RequestOutput(
+                request_id=f"kvprune-tp-{idx}",
+                prompt=None,
+                prompt_token_ids=list(prompt_ids),
+                prompt_logprobs=None,
+                outputs=[completion],
+                finished=True,
+            )
+        )
+    return request_outputs
+
+
+def _ensure_compactor_engine(llm: Any) -> LLMEngine:
+    """Return the compactor engine cached on ``llm``, creating it on first use.
+
+    The engine is stored on ``llm._kvprune_compactor_engine``. Creation goes
+    through ``create_compactor_engine_with_shared_weights``.
+
+    Raises:
+        ValueError: an engine must be created but ``tensor_parallel_size != 1``.
+    """
+    engine = llm._kvprune_compactor_engine
+    if engine is not None:
+        return engine
+    parallel = llm.llm_engine.vllm_config.parallel_config
+    if parallel.tensor_parallel_size != 1:
+        raise ValueError(
+            "KV-pruning compactor path requires tensor_parallel_size=1 "
+            "for shared weights."
+        )
+    engine = create_compactor_engine_with_shared_weights(llm)
+    llm._kvprune_compactor_engine = engine
+    logger.info("Initialized compactor LLMEngine with weights shared from vLLM.")
+    return engine
+
+
+def try_compressed_generate(
+    llm: Any,
+    prompts: Any,
+    sampling_params: SamplingParams | Sequence[SamplingParams] | None,
+    *,
+    compression: CompressionParams | Sequence[CompressionParams] | None,
+    use_tqdm: bool | Callable[..., tqdm] = True,
+    lora_request: Any = None,
+    priority: list[int] | None = None,
+    tokenization_kwargs: dict[str, Any] | None = None,
+) -> list[RequestOutput] | None:
+    """Return completions on the compactor engine, or ``None`` to use normal v1.
+
+    ``lora_request`` / ``priority`` / ``tokenization_kwargs`` are accepted for API
+    parity with :meth:`~vllm.entrypoints.llm.LLM.generate` but are not passed to the
+    compactor engine yet. ``use_tqdm`` is likewise discarded.
+
+    Flow:
+
+    1. Normalize prompts, sampling params and compression params to equal-length
+       lists.
+    2. With ``tensor_parallel_size == 1``: return ``None`` unless a prompt requests
+       compression or the compactor path is forced via environment.
+    3. With ``tensor_parallel_size > 1``: always use the collective RPC path; when
+       no compression was requested, every prompt gets ``compression_ratio=1.0``.
+    4. Otherwise run the in-process shared-weight compactor engine (created on
+       first use) and convert its sequences into ``RequestOutput`` objects.
+
+    Args:
+        llm: vLLM entrypoint object; must expose ``llm_engine.vllm_config`` and the
+            ``_kvprune_compactor_engine`` attribute.
+        prompts: a single prompt or an iterable of prompts.
+        sampling_params: one instance, one per prompt, or ``None`` for defaults.
+        compression: one instance, one per prompt, or ``None`` for ratio ``1.0``.
+
+    Returns:
+        A list of ``RequestOutput`` from the compactor path, or ``None`` when the
+        caller should fall back to the standard v1 engine.
+    """
+    # Explicitly drop the parity-only arguments so they cannot be used by accident.
+    del lora_request, priority, tokenization_kwargs, use_tqdm
+
+    plist = _normalize_prompt_list(prompts)
+    sps = _normalize_sampling_params(sampling_params, len(plist))
+    comps = _normalize_compression_params(compression, len(plist))
+
+    pc = llm.llm_engine.vllm_config.parallel_config
+    # TP>1: every worker must run the same collective_rpc session. If all
+    # compression_ratio >= 1, the old code returned None and only the driver ran
+    # v1 _run_engine — other ranks never joined a matching collective, which can
+    # deadlock NCCL / leave workers unsynchronized (hang at "Processed prompts:").
+    if pc.tensor_parallel_size > 1:
+        if not _should_use_kvprune_compactor_path(comps):
+            # No compression requested: stay on the collective path with ratio 1.0.
+            comps = [CompressionParams(compression_ratio=1.0) for _ in plist]
+    elif not _should_use_kvprune_compactor_path(comps):
+        # TP=1 and nothing to compress: let the caller use the normal v1 engine.
+        return None
+
+    # Warn (do not fail) when the v1 engine was not configured with enforce_eager.
+    v1_eager = bool(
+        getattr(llm.llm_engine.vllm_config.model_config, "enforce_eager", False)
+    )
+    if not v1_eager:
+        logger.warning(
+            "KV-prune compression: v1 CUDA graphs are still enabled on this LLM. "
+            "The compactor does not reuse v1 graphs; capture wastes VRAM. "
+            "Set kvprune_compression=True, enforce_eager=True, or "
+            "VLLM_KVPRUNE_COMPRESSION_DEFAULT=1 before import vllm."
+        )
+
+    if pc.tensor_parallel_size > 1:
+        return _compressed_generate_tp_collective(llm, plist, sps, comps)
+
+    # TP=1 from here on: shared-weight compactor engine in this process.
+    ensure_inprocess_engine_for_weight_sharing()
+    # The optional v1 KV release runs only before the engine is first created.
+    if llm._kvprune_compactor_engine is None:
+        _maybe_release_v1_kv_for_compactor(llm)
+    engine = _ensure_compactor_engine(llm)
+    # Translate vLLM-side parameter objects into the compactor's own types.
+    comp_sp = [_to_compactor_sampling(sp) for sp in sps]
+    seq_c = [_to_sequence_compression(c) for c in comps]
+    batch_c = _batch_compression_from_comps(comps)
+    comp_in = [_prompt_to_compactor_input(p) for p in plist]
+
+    # First element of the returned pair is unused here; only the sequences are kept.
+    _, seqs = engine.generate(
+        comp_in,
+        sampling_params=comp_sp,
+        batch_compression_params=batch_c,
+        per_sequence_compression_params=seq_c,
+        return_sequences=True,
+    )
+
+    return _sequences_to_request_outputs(seqs, engine)
+
+
+def _sequences_to_request_outputs(seqs: list[Any], engine: LLMEngine) -> list[RequestOutput]:
+    """Wrap compactor sequences as finished vLLM ``RequestOutput`` objects.
+
+    Each sequence's ``completion_token_ids`` are decoded with ``engine.tokenizer``.
+    Every output is reported with ``finish_reason="stop"`` and request id
+    ``kvprune-<index>``.
+    """
+    tokenizer = engine.tokenizer
+    request_outputs: list[RequestOutput] = []
+    for idx, seq in enumerate(seqs):
+        text = tokenizer.decode(seq.completion_token_ids, skip_special_tokens=True)
+        # When every emitted id is a special token (e.g. EOS / chat boundary) the
+        # stripped text is blank although tokens were produced; decode again with
+        # special tokens kept so the caller sees them instead of an empty answer.
+        if not text.strip():
+            if seq.completion_token_ids:
+                text = tokenizer.decode(
+                    seq.completion_token_ids, skip_special_tokens=False
+                )
+        completion = CompletionOutput(
+            index=0,
+            text=text,
+            token_ids=list(seq.completion_token_ids),
+            cumulative_logprob=None,
+            logprobs=None,
+            finish_reason="stop",
+        )
+        request_outputs.append(
+            RequestOutput(
+                request_id=f"kvprune-{idx}",
+                prompt=None,
+                prompt_token_ids=list(seq.prompt_token_ids),
+                prompt_logprobs=None,
+                outputs=[completion],
+                finished=True,
+            )
+        )
+    return request_outputs
+