vllm kvprune:v1.0.1

f81ce56b · chenzk · 2b7160c6 · 2b7160c6 · 2b7160c6 · 2b7160c6
Commit f81ce56b authored Apr 23, 2026 by chenzk
20 changed files
--- a/vllm/kvprune_legacy_save/config/engine_config.py
+++ b/vllm/kvprune_legacy_save/config/engine_config.py
-import os
-from dataclasses import dataclass
-from enum import Enum, auto
-from typing import List, Optional
-
-from transformers import AutoConfig
-
-
-class AttentionBackend(Enum):
-    """Legacy coarse backend toggle (prefer :class:`KvpruneAttentionSchedule`).
-
-    ``LLMConfig.__post_init__`` translates a non-``None`` value of this enum
-    into a :class:`KvpruneAttentionSchedule` member.
-    """
-
-    # Translated to ``KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE``.
-    FLASH_ATTENTION = auto()
-    # Translated to ``KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE``.
-    COMPACTOR_TRITON = auto()
-
-
-class KvpruneAttentionSchedule(Enum):
-    """FlashAttention vs Triton split for prefill / decode (KV **writes** stay Triton).
-
-    Each member names which attention implementation is used for the prefill
-    phase and which for the decode phase.
-    """
-
-    # Default: FA varlen prefill; decode uses ``head_sparse_decode_attention`` (Triton).
-    FA_PREFILL_TRITON_DECODE = auto()
-    # Prefill attention uses ``causal_sparse_varlen_with_cache`` (Triton); decode Triton.
-    TRITON_PREFILL_TRITON_DECODE = auto()
-    # "PDFA": FA prefill + FA decode; paged KV **storage** (incl. pruned top-k) unchanged.
-    PDFA = auto()
-
-
-@dataclass
-class LLMConfig:
-    """Configuration for the :class:`LLM` engine.
-
-    Parameters
-    ----------
-    model : str
-        Hugging Face model identifier (e.g. ``"meta-llama/Meta-Llama-3-8B"``) or
-        a local model name that can be resolved by
-        :func:`transformers.AutoConfig.from_pretrained`.
-    path : str, optional
-        Local directory containing the model weights. If ``None``, the engine
-        will attempt to resolve a local snapshot for ``model`` using
-        :func:`huggingface_hub.snapshot_download`.
-    nccl_port : int, optional, default 1218
-        Port number kept on the config; it is not read in this module
-        (presumably consumed by the tensor-parallel setup -- confirm there).
-    max_num_seqs : int, default 256
-        Upper bound on the number of concurrent batches that the scheduler and
-        KV-cache manager are allowed to handle. This affects the size of the
-        page table and some internal buffers.
-    max_model_len : int, default 40960
-        Maximum context length (in tokens) that the engine will allocate KV cache
-        and CUDA graphs for. During initialization this value is clamped to
-        ``hf_config.max_position_embeddings`` for the chosen model.
-    gpu_memory_utilization : float, default 0.9
-        Fraction of the total GPU memory that may be used for KV cache and model
-        activations. Values should be in ``(0, 1]``. If this budget is too small,
-        the KV-cache manager may raise an error at warmup time due
-        to insufficient memory.
-    tensor_parallel_size : int, default 1
-        Number of tensor-parallel workers to shard the model
-        across. Must be between 1 and 8, and must evenly divide the model's
-        number of key/value heads.
-    enforce_eager : bool, default False
-        If ``True``, disable CUDA graph capture and always run the model in
-        eager mode during decoding. This reduces throughput. When ``False``,
-        the engine will capture and reuse CUDA graphs for supported
-        batch sizes and sequence lengths.
-    hf_config : transformers.AutoConfig, optional
-        Pre-loaded Hugging Face configuration for the model. If ``None``,
-        it is populated automatically from ``model``.
-    eos : int, default -1
-        Primary stop token id (warmup / single-id paths). If ``-1``, the
-        :class:`LLM` constructor fills this and :attr:`eos_token_ids` from the
-        tokenizer.
-    eos_token_ids : list of int, optional
-        All token ids that terminate generation (e.g. HF tokenizers may expose
-        ``eos_token_id`` as a list for chat models). If ``None``, inferred in
-        :class:`LLM` from the tokenizer and model type.
-    kvcache_page_size : int, default 128
-        Number of tokens stored in a single KV-cache page. Smaller pages improve
-        allocation flexibility but increase page-table overhead; larger pages
-        reduce overhead but have coarser granularity.
-    leverage_sketch_size : int, default 48
-        Sketch dimension used by the Compactor leverage-score estimator.
-    attention_schedule : KvpruneAttentionSchedule, default FA_PREFILL_TRITON_DECODE
-        Which **attention** implementation runs on prefill vs decode. KV **writes**
-        (``prefill_store_*``, ``decode_store_kv``, pruned top-k) always use the
-        existing Triton store kernels. Env ``VLLM_KVPRUNE_ATTENTION_SCHEDULE`` uses
-        short names: ``fa_triton`` (default), ``pdtriton``, ``pdfa``. Enum values:
-        ``FA_PREFILL_TRITON_DECODE`` — FA prefill, Triton decode;
-        ``TRITON_PREFILL_TRITON_DECODE`` — Triton prefill + decode;
-        ``PDFA`` — FA prefill + FA decode (still Triton KV I/O).
-    attention_backend : AttentionBackend, optional
-        Deprecated. When not ``None`` it **overrides** ``attention_schedule``:
-        ``FLASH_ATTENTION`` selects ``FA_PREFILL_TRITON_DECODE`` and any other
-        value selects ``TRITON_PREFILL_TRITON_DECODE``.
-    show_progress_bar : bool, default True
-        Flag kept on the config; not read in this module (presumably toggles
-        progress output in the runner -- confirm there).
-
-    Raises
-    ------
-    NotADirectoryError
-        If ``path`` is given but is not an existing directory.
-    ValueError
-        If ``tensor_parallel_size`` is outside ``[1, 8]``.
-    """
-
-    model: str
-    path: Optional[str] = None
-    nccl_port: Optional[int] = 1218
-    max_num_seqs: int = 256
-    max_model_len: int = 40960
-    gpu_memory_utilization: float = 0.9
-    tensor_parallel_size: int = 1
-    enforce_eager: bool = False
-    hf_config: AutoConfig | None = None
-    eos: int = -1
-    eos_token_ids: Optional[List[int]] = None
-    kvcache_page_size: int = 128
-    leverage_sketch_size: int = 48
-    attention_schedule: KvpruneAttentionSchedule = (
-        KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-    )
-    attention_backend: AttentionBackend | None = None
-    show_progress_bar: bool = True
-
-    def __post_init__(self):
-        """Validate fields, apply the legacy backend mapping, and load ``hf_config``."""
-        # Legacy toggle: a non-None backend replaces whatever schedule was given.
-        if self.attention_backend is not None:
-            if self.attention_backend == AttentionBackend.FLASH_ATTENTION:
-                self.attention_schedule = KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-            else:
-                self.attention_schedule = (
-                    KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
-                )
-        if self.path is not None and not os.path.isdir(self.path):
-            raise NotADirectoryError(f"Engine config dir {self.path} does not exist")
-        # Raise ValueError directly. An assert placed inside this branch would
-        # always fire first and mask the ValueError (and vanish under ``-O``).
-        if self.tensor_parallel_size <= 0 or self.tensor_parallel_size > 8:
-            raise ValueError("tensor_parallel_size must be >= 1 and <= 8")
-        if self.hf_config is None:
-            self.hf_config = AutoConfig.from_pretrained(self.model)
-        # Never allocate for a longer context than the model supports.
-        self.max_model_len = min(
-            self.max_model_len, self.hf_config.max_position_embeddings
-        )
-
--- a/vllm/kvprune_legacy_save/config/sampling_params.py
+++ b/vllm/kvprune_legacy_save/config/sampling_params.py
-from dataclasses import dataclass
-
-
-@dataclass
-class SamplingParams:
-    """Per-request sampling settings.
-
-    Only ``temperature`` is validated: a negative value is rejected with
-    ``ValueError`` when the instance is created.
-    """
-
-    temperature: float = 1.0
-    max_new_tokens: int = 256
-
-    def __post_init__(self):
-        """Reject negative temperatures."""
-        is_negative = self.temperature < 0
-        if is_negative:
-            raise ValueError("Temperature cannot be negative")
--- a/vllm/kvprune_legacy_save/core/__init__.py
+++ b/vllm/kvprune_legacy_save/core/__init__.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Core: compactor ``LLMEngine`` stack (``llm_engine``, ``scheduler``, …) plus helpers
-(``runtime``, ``flash_integration``, ``block_budget``) used **inside** the compactor path.
-
-v1 does not import these; use :meth:`vllm.LLM.generate` with ``compression=`` for the
-``LLM`` + compactor integration.
-"""
-
-from vllm.kvprune.core.block_budget import (
-    TailReclaimHint,
-    build_tail_reclaim_hint,
-    tail_blocks_if_logical_shorter,
-)
-from vllm.kvprune.core.compression_bridge import (
-    VALID_ALIASES_FOR_SAMPLING,
-    compression_method_id_to_enum,
-    compression_method_str_to_id,
-)
-from vllm.kvprune.core.flash_integration import (
-    do_kv_cache_update_kv_prune,
-    merge_seq_lens_with_kv_prune,
-)
-from vllm.kvprune.core.runtime import (
-    KVPruneForwardState,
-    build_kv_prune_forward_state,
-    get_kv_prune_state,
-    layer_index_from_layer_name,
-)
-
-__all__ = [
-    "KVPruneForwardState",
-    "TailReclaimHint",
-    "VALID_ALIASES_FOR_SAMPLING",
-    "build_kv_prune_forward_state",
-    "build_tail_reclaim_hint",
-    "compression_method_id_to_enum",
-    "compression_method_str_to_id",
-    "do_kv_cache_update_kv_prune",
-    "get_kv_prune_state",
-    "layer_index_from_layer_name",
-    "merge_seq_lens_with_kv_prune",
-    "tail_blocks_if_logical_shorter",
-]
--- a/vllm/kvprune_legacy_save/core/block_budget.py
+++ b/vllm/kvprune_legacy_save/core/block_budget.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Block budget helpers for compactor KV pruning (logical vs physical length).
-
-Used by the **compactor** ``LLMEngine`` path (``PagedKVCache`` / logical lengths),
-not by v1's scheduler. The helpers compare logical KV length to a physical token
-count and return how many full tail blocks can be reclaimed when logical shrinks.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-
-@dataclass(frozen=True)
-class TailReclaimHint:
-    """How many tail blocks could be freed if logical KV shrinks below allocation.
-
-    Immutable record (``frozen=True``) produced by ``build_tail_reclaim_hint``.
-    """
-
-    # Identifier of the request this hint describes.
-    request_id: str
-    # Physical token count handed to the helper as ``allocated_tokens``.
-    allocated_tokens: int
-    # Logical (retained) token count handed to the helper as ``logical_tokens``.
-    logical_tokens: int
-    # Tokens per block used for the block-granular computation.
-    block_size: int
-    # Result of ``tail_blocks_if_logical_shorter`` for the three values above.
-    reclaimable_tail_blocks: int
-
-
-def tail_blocks_if_logical_shorter(
-    allocated_tokens: int,
-    logical_tokens: int,
-    block_size: int,
-) -> int:
-    """Count whole tail blocks that hold no retained logical token.
-
-    Returns 0 when ``block_size`` is not positive or when the logical length
-    is not shorter than the allocation. With no logical tokens at all, every
-    allocated block (rounded up) is reported. Otherwise the result is the
-    distance between the block holding the last allocated token and the block
-    holding the last logical token.
-    """
-    if block_size <= 0 or logical_tokens >= allocated_tokens:
-        return 0
-    if logical_tokens <= 0:
-        # Nothing retained: all allocated blocks are reclaimable (ceil division).
-        return (allocated_tokens + block_size - 1) // block_size
-    # Index of the block containing the final token of each length.
-    final_alloc_block = (allocated_tokens - 1) // block_size
-    final_logical_block = (logical_tokens - 1) // block_size
-    spare = final_alloc_block - final_logical_block
-    return spare if spare > 0 else 0
-
-
-def build_tail_reclaim_hint(
-    request_id: str,
-    allocated_tokens: int,
-    logical_tokens: int,
-    block_size: int,
-) -> TailReclaimHint:
-    """Bundle the inputs with their reclaimable tail-block count.
-
-    The count comes from :func:`tail_blocks_if_logical_shorter`; the other
-    fields are copied through unchanged.
-    """
-    return TailReclaimHint(
-        request_id=request_id,
-        allocated_tokens=allocated_tokens,
-        logical_tokens=logical_tokens,
-        block_size=block_size,
-        reclaimable_tail_blocks=tail_blocks_if_logical_shorter(
-            allocated_tokens, logical_tokens, block_size
-        ),
-    )
-
-
-__all__ = [
-    "TailReclaimHint",
-    "build_tail_reclaim_hint",
-    "tail_blocks_if_logical_shorter",
-]
--- a/vllm/kvprune_legacy_save/core/compression_bridge.py
+++ b/vllm/kvprune_legacy_save/core/compression_bridge.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Map compression method strings (e.g. from :class:`~vllm.kvprune.integration.CompressionParams`) to kvprune GPU / enum IDs."""
-
-from __future__ import annotations
-
-from vllm.kvprune.compression.compression_config import CompressionMethod
-
-# IDs stored on device [num_reqs_padded] (int32). Order is stable for kernels.
-# Do not renumber: the same integers are used as keys of both tables below.
-COMPRESSION_METHOD_ID_NONE = 0
-COMPRESSION_METHOD_ID_CRITICALADAKV = 1
-COMPRESSION_METHOD_ID_COMPACTOR = 2
-COMPRESSION_METHOD_ID_SNAPKV = 3
-
-# Aliases accepted for method strings (case-insensitive after strip).
-VALID_ALIASES_FOR_SAMPLING: frozenset[str] = frozenset(
-    {"none", "criticaladakv", "compactor", "snapkv"}
-)
-
-# Normalized (lower-case, stripped) method name -> integer id.
-_STR_TO_ID: dict[str, int] = {
-    "none": COMPRESSION_METHOD_ID_NONE,
-    "criticaladakv": COMPRESSION_METHOD_ID_CRITICALADAKV,
-    "compactor": COMPRESSION_METHOD_ID_COMPACTOR,
-    "snapkv": COMPRESSION_METHOD_ID_SNAPKV,
-}
-
-# Integer id -> ``CompressionMethod`` enum member.
-_ID_TO_COMPRESSION_METHOD: dict[int, CompressionMethod] = {
-    COMPRESSION_METHOD_ID_NONE: CompressionMethod.NONE,
-    COMPRESSION_METHOD_ID_CRITICALADAKV: CompressionMethod.CRITICALADAKV,
-    COMPRESSION_METHOD_ID_COMPACTOR: CompressionMethod.COMPACTOR,
-    COMPRESSION_METHOD_ID_SNAPKV: CompressionMethod.SNAPKV,
-}
-
-
-def compression_method_str_to_id(s: str) -> int:
-    """Translate a user-supplied method name into its stable integer id.
-
-    A falsy ``s`` is treated as ``"none"``; otherwise the name is stripped and
-    lower-cased before lookup. Unknown names raise ``ValueError`` listing the
-    accepted aliases.
-    """
-    normalized = (s or "none").strip().lower()
-    method_id = _STR_TO_ID.get(normalized)
-    if method_id is None:
-        raise ValueError(
-            f"Unknown compression_method {s!r}; expected one of "
-            f"{sorted(VALID_ALIASES_FOR_SAMPLING)}"
-        )
-    return method_id
-
-
-def compression_method_id_to_enum(method_id: int) -> CompressionMethod:
-    """Return the enum member for ``method_id``; unknown ids map to ``NONE``."""
-    return _ID_TO_COMPRESSION_METHOD.get(method_id, CompressionMethod.NONE)
-
-
-__all__ = [
-    "COMPRESSION_METHOD_ID_NONE",
-    "COMPRESSION_METHOD_ID_CRITICALADAKV",
-    "COMPRESSION_METHOD_ID_COMPACTOR",
-    "COMPRESSION_METHOD_ID_SNAPKV",
-    "VALID_ALIASES_FOR_SAMPLING",
-    "compression_method_id_to_enum",
-    "compression_method_str_to_id",
-]
--- a/vllm/kvprune_legacy_save/core/flash_integration.py
+++ b/vllm/kvprune_legacy_save/core/flash_integration.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""FlashAttention + KV cache hooks for kvprune."""
-
-from __future__ import annotations
-
-import torch
-
-from vllm.kvprune.core.runtime import KVPruneForwardState, get_kv_prune_state
-
-_RATIO_ONE = 1.0 - 1e-6
-
-
-def merge_seq_lens_with_kv_prune(
-    base_seq_lens: torch.Tensor,
-    layer_name: str,
-    max_query_len: int,
-) -> torch.Tensor:
-    """Substitute per-layer logical lengths for pruned requests during decode.
-
-    ``base_seq_lens`` is returned untouched when no kvprune state is active or
-    when ``max_query_len > 1`` (prefill). Otherwise a clone is returned whose
-    first ``state.num_reqs`` entries use the layer's logical length wherever
-    the request's compression ratio is below ``_RATIO_ONE``.
-    """
-    state = get_kv_prune_state()
-    # Prefill: only scheduler lengths are reliable unless compactor store ran for
-    # every layer (try_prefill_kv_store); when pruning is requested but ineligible
-    # (e.g. unsupported dtype), logical buffers may still be zero — do not override.
-    if state is None or max_query_len > 1:
-        return base_seq_lens
-    layer_idx = _layer_idx(layer_name)
-    active = state.num_reqs
-    ratios = state.compression_ratio_gpu[:active]
-    logical_lens = state.logical_seq_lens_gpu[layer_idx, :active]
-    if logical_lens.dim() == 2:
-        # Collapse the trailing axis to its maximum.
-        logical_lens = logical_lens.max(dim=-1).values
-    merged = base_seq_lens.clone()
-    merged[:active] = torch.where(
-        ratios < _RATIO_ONE,
-        logical_lens.to(merged.dtype),
-        base_seq_lens[:active],
-    )
-    return merged
-
-
-def _layer_idx(layer_name: str) -> int:
-    """Resolve ``layer_name`` to its integer layer index via the runtime helper."""
-    from vllm.kvprune.core.runtime import (
-        layer_index_from_layer_name as _resolve_index,
-    )
-
-    return _resolve_index(layer_name)
-
-
-def do_kv_cache_update_kv_prune(
-    layer: torch.nn.Module,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    kv_cache: torch.Tensor,
-    slot_mapping: torch.Tensor,
-    reshape_and_cache_flash,
-    kv_cache_dtype: str,
-) -> bool:
-    """Perform the KV-cache write for this step when kvprune is active.
-
-    Returns ``False`` when no kvprune state exists, or when the prefill store
-    declines the step, so the caller runs its default path. Returns ``True``
-    when the write was handled here.
-    """
-    state = get_kv_prune_state()
-    if state is None:
-        return False
-
-    layer_idx = _layer_idx(layer.layer_name)
-    active = state.num_reqs
-
-    if state.is_prefill:
-        from vllm.kvprune.compression.prefill import try_prefill_kv_store
-
-        return True if try_prefill_kv_store(layer, key, value, kv_cache) else False
-
-    # Decode: write through the supplied cache kernel ...
-    key_cache, value_cache = kv_cache.unbind(0)
-    reshape_and_cache_flash(
-        key,
-        value,
-        key_cache,
-        value_cache,
-        slot_mapping,
-        kv_cache_dtype,
-        layer._k_scale,
-        layer._v_scale,
-    )
-    # ... then bump the logical length by one for every request being pruned.
-    pruned = (state.compression_ratio_gpu[:active] < _RATIO_ONE).to(torch.int32)
-    counters = state.logical_seq_lens_gpu[layer_idx, :active]
-    step = pruned.unsqueeze(-1) if counters.dim() == 2 else pruned
-    counters += step  # in-place on the indexed slice of the state buffer
-    return True
--- a/vllm/kvprune_legacy_save/core/llm_engine.py
+++ b/vllm/kvprune_legacy_save/core/llm_engine.py
-from __future__ import annotations
-
-import atexit
-import inspect
-import logging
-from pathlib import Path
-from typing import Any, List, Optional, Union
-
-import torch.nn as nn
-import torch.multiprocessing as mp
-from vllm.kvprune.compression.compression_config import (
-    BatchCompressionParams,
-    SequenceCompressionParams,
-)
-from vllm.kvprune.config.engine_config import LLMConfig
-from vllm.kvprune.config.sampling_params import SamplingParams
-from vllm.kvprune.core.model_runner import ModelRunner
-from vllm.kvprune.models import MODEL_REGISTRY
-from vllm.kvprune.utils.sequence import Sequence
-from transformers import AutoTokenizer
-
-logger = logging.getLogger(__name__)
-
-PromptLike = Union[str, List[int]]
-
-
-def _infer_stop_token_ids(tokenizer, hf_config) -> list[int]:
-    """
-    Collect the token ids that should end generation.
-
-    ``tokenizer.eos_token_id`` may be a single id or a list/tuple of ids; all
-    of them are kept, in order. For Qwen model types (``qwen2``, ``qwen3`` and
-    their ``_moe`` variants) any added or additional special token whose text
-    contains ``im_end`` (the ``<|im_end|>`` assistant turn boundary) is
-    appended as well. Only the exact substring ``im_end`` is matched, so
-    unrelated tokens are not picked up.
-
-    Ids that are not non-negative ints, that equal the tokenizer's unknown
-    token id, or that are already present are skipped. Raises ``ValueError``
-    when nothing could be inferred.
-    """
-    eos_raw = tokenizer.eos_token_id
-    if isinstance(eos_raw, (list, tuple)):
-        stop_ids: list[int] = [int(x) for x in eos_raw]
-    elif eos_raw is not None:
-        stop_ids = [int(eos_raw)]
-    else:
-        stop_ids = []
-    unk_id = getattr(tokenizer, "unk_token_id", None)
-
-    def _register(candidate: int) -> None:
-        # Accept only valid, non-UNK ids that have not been seen yet.
-        if not isinstance(candidate, int) or candidate < 0:
-            return
-        if unk_id is not None and candidate == unk_id:
-            return
-        if candidate not in stop_ids:
-            stop_ids.append(candidate)
-
-    if getattr(hf_config, "model_type", None) in (
-        "qwen2",
-        "qwen3",
-        "qwen2_moe",
-        "qwen3_moe",
-    ):
-        added = getattr(tokenizer, "added_tokens_encoder", None)
-        if isinstance(added, dict):
-            for token_text, token_id in added.items():
-                if isinstance(token_text, str) and "im_end" in token_text:
-                    _register(int(token_id))
-        specials = getattr(tokenizer, "additional_special_tokens", []) or []
-        for token_text in specials:
-            if isinstance(token_text, str) and "im_end" in token_text:
-                try:
-                    token_id = tokenizer.convert_tokens_to_ids(token_text)
-                except (TypeError, ValueError, KeyError):
-                    continue
-                _register(token_id)
-
-    if not stop_ids:
-        raise ValueError(
-            "Could not infer stop token ids from the tokenizer; set "
-            "LLMConfig(eos_token_ids=[...]) explicitly."
-        )
-    return stop_ids
-
-
-def _merge_apply_chat_template_kwargs(
-    tokenizer,
-    user_kwargs: Optional[dict[str, Any]],
-) -> dict[str, Any]:
-    """
-    Return a copy of ``user_kwargs`` with chat-template defaults filled in.
-
-    When ``tokenizer.apply_chat_template`` accepts them and the caller did not
-    set them, ``add_generation_prompt=True`` and ``enable_thinking=False`` are
-    added. The first makes generation continue the assistant turn (Qwen3-style
-    instruct models otherwise tend to echo template fragments); the second
-    turns off the Qwen3 reasoning channel. If the signature cannot be
-    inspected, the copy is returned as is.
-    """
-    merged = dict(user_kwargs or {})
-    try:
-        accepted = inspect.signature(tokenizer.apply_chat_template).parameters
-    except (TypeError, ValueError):
-        return merged
-    for option, default in (
-        ("add_generation_prompt", True),
-        ("enable_thinking", False),
-    ):
-        if option in accepted and option not in merged:
-            merged[option] = default
-    return merged
-
-
-def _runner_entry(config: LLMConfig, rank: int, evt):
-    """Process entry point: build a ``ModelRunner`` for ``rank`` and run its loop.
-
-    Any ``Exception`` from construction or the loop is logged and swallowed;
-    a successfully constructed runner is always shut down with ``exit()``.
-    """
-    try:
-        runner = ModelRunner(config, rank, evt)
-    except Exception as e:
-        # Nothing was constructed, so there is nothing to shut down.
-        logging.exception(f"Rank {rank}: {repr(e)}")
-        return
-    try:
-        runner.loop()
-    except Exception as e:
-        logging.exception(f"Rank {rank}: {repr(e)}")
-    finally:
-        runner.exit()
-
-
-class LLMEngine:
-    """High-level engine coordinating model runners and scheduling"""
-
-    def __init__(self, config: LLMConfig, external_model: nn.Module | None = None):
-        """Resolve model files, load the tokenizer, and start the model runners.
-
-        Args:
-            config: Engine configuration. ``path``, ``eos`` and
-                ``eos_token_ids`` are filled in / normalized in place.
-            external_model: Optional pre-built module handed to the rank-0
-                ``ModelRunner``; only allowed with ``tensor_parallel_size=1``.
-
-        Raises:
-            ValueError: If the model type is not in ``MODEL_REGISTRY``, or if
-                ``external_model`` is combined with tensor parallelism.
-        """
-        self.config = config
-        if self.config.hf_config.model_type not in MODEL_REGISTRY:
-            raise ValueError(f"Unknown model {self.config.model}")
-        if config.path is None:
-            # Local directory: use it directly (no Hub round-trip).
-            # NOTE: the local below must not be named ``mp``. Binding ``mp``
-            # anywhere in this function makes it function-local and shadows the
-            # module-level ``torch.multiprocessing as mp`` used further down,
-            # which broke every tensor_parallel_size > 1 start-up.
-            try:
-                model_dir = Path(config.model)
-                if model_dir.is_dir() and (model_dir / "config.json").is_file():
-                    self.config.path = str(model_dir.resolve())
-                    logger.info("Using local model directory for tokenizer: %s", self.config.path)
-            except OSError:
-                # Best effort only: fall through to the Hub lookup below.
-                pass
-        if config.path is None:
-            from huggingface_hub import snapshot_download
-
-            # Hub repo id: allow downloading missing shards/tokenizer files when cache
-            # is incomplete (local_files_only=False). Local dirs are handled above.
-            self.config.path = snapshot_download(
-                repo_id=config.model,
-                local_files_only=False,
-            )
-            logger.info(
-                "Resolved Hugging Face snapshot for %s @ %s",
-                self.config.model,
-                self.config.path,
-            )
-        assert self.config.path is not None
-        _trust = bool(getattr(self.config.hf_config, "trust_remote_code", False))
-        # Always load tokenizer from the resolved on-disk tree so we do not re-hit
-        # the Hub with the repo id (can re-download tokenizer / LFS shards).
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.config.path,
-            use_fast=True,
-            trust_remote_code=_trust,
-        )
-        # Normalize stop tokens: explicit list > explicit single ``eos`` > inferred.
-        if self.config.eos_token_ids is None:
-            if self.config.eos != -1:
-                self.config.eos_token_ids = [int(self.config.eos)]
-            else:
-                self.config.eos_token_ids = _infer_stop_token_ids(
-                    self.tokenizer, self.config.hf_config
-                )
-        else:
-            self.config.eos_token_ids = [int(x) for x in self.config.eos_token_ids]
-        self.config.eos_token_ids = sorted(set(self.config.eos_token_ids))
-        if self.config.eos == -1:
-            self.config.eos = int(self.config.eos_token_ids[0])
-        else:
-            self.config.eos = int(self.config.eos)
-            # Keep the primary id a member of the full stop set.
-            if self.config.eos not in self.config.eos_token_ids:
-                self.config.eos_token_ids = sorted(
-                    self.config.eos_token_ids + [self.config.eos]
-                )
-
-        if external_model is not None and int(self.config.tensor_parallel_size) != 1:
-            raise ValueError(
-                "external_model (shared-weight compactor path) only supports "
-                "tensor_parallel_size=1"
-            )
-
-        self.ps = []
-        world_size = int(self.config.tensor_parallel_size)
-        self.events = []
-        if world_size > 1:
-            # Ranks 1..world_size-1 run in spawned daemon processes; rank 0
-            # runs in this process (below).
-            ctx = mp.get_context("spawn")
-            for r in range(1, world_size):
-                event = ctx.Event()
-                p = ctx.Process(
-                    target=_runner_entry,
-                    args=(self.config, r, event),
-                    daemon=True,
-                )
-                p.start()
-                self.ps.append(p)
-                self.events.append(event)
-
-        self.master_model_runner = ModelRunner(
-            self.config,
-            rank=0,
-            peer_events=self.events,
-            external_model=external_model,
-        )
-        atexit.register(self.exit)
-
-    def exit(self):
-        """Shut the engine down once; later calls are no-ops.
-
-        Exits the rank-0 runner (failures are logged, not raised), terminates
-        worker processes that are still alive, and clears the peer events.
-        """
-        if getattr(self, "_exited", False):
-            return
-        self._exited = True
-        master = getattr(self, "master_model_runner", None)
-        if master is not None:
-            try:
-                master.exit()
-            except Exception:
-                logger.exception("Failed to exit master ModelRunner cleanly")
-        for worker in self.ps:
-            if not worker.is_alive():
-                continue
-            worker.terminate()
-            worker.join(timeout=1.0)
-        if hasattr(self, "events"):
-            self.events.clear()
-
-    def tokenize_prompt(self, prompt: PromptLike, **tokenizer_kwargs) -> List[int]:
-        """
-        Return token IDs for ``prompt``.
-
-        A string is run through ``self.tokenizer`` (with ``tokenizer_kwargs``)
-        and its ``"input_ids"`` entry is returned; anything else is assumed to
-        be pre-tokenized and is copied into a new list.
-        """
-        if not isinstance(prompt, str):
-            return list(prompt)
-        encoded = self.tokenizer(prompt, **tokenizer_kwargs)
-        return encoded["input_ids"]
-
-    def detokenize_prompt(
-        self, sequences: List[Sequence], **detokenizer_kwargs
-    ) -> List[str]:
-        """
-        Decode each sequence's ``completion_token_ids`` into text.
-
-        ``skip_special_tokens=True`` is the default; any ``detokenizer_kwargs``
-        override it and are forwarded to ``tokenizer.batch_decode``.
-        """
-        decode_kwargs: dict[str, Any] = {"skip_special_tokens": True}
-        decode_kwargs.update(detokenizer_kwargs)
-        completions = [seq.completion_token_ids for seq in sequences]
-        return self.tokenizer.batch_decode(completions, **decode_kwargs)
-
-    def _build_sequences(
-        self,
-        prompts: List[PromptLike] | PromptLike,
-        sampling_params: SamplingParams | List[SamplingParams],
-        per_sequence_compression_params: Optional[
-            SequenceCompressionParams | List[SequenceCompressionParams]
-        ] = None,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-    ) -> List[Sequence]:
-        """
-        Build Sequence objects from prompts, sampling params, and optional
-        per-sequence compression parameters.
-
-        Args:
-            prompts: One prompt or a list of prompts; each prompt is raw text
-                or a list of token ids. A non-empty flat list of ints is
-                treated as ONE pre-tokenized prompt.
-            sampling_params: One instance shared by all prompts, or a list
-                with one entry per prompt.
-            per_sequence_compression_params: ``None`` (ratio 1.0 for every
-                prompt), one instance applied to all prompts, or a list with
-                one entry per prompt.
-            tokenizer_kwargs: Forwarded to ``tokenize_prompt`` for each prompt.
-
-        Returns:
-            One ``Sequence`` per prompt, in input order.
-
-        Notes:
-            When the protected head + tail cover the whole prompt, compression
-            is disabled for that sequence only. This is done on a copy, so the
-            caller's params object (possibly shared by the whole batch) is
-            never mutated.
-        """
-        import copy  # function-scope import keeps this fix self-contained
-
-        tokenizer_kwargs = {} if tokenizer_kwargs is None else tokenizer_kwargs
-
-        if not isinstance(prompts, list):
-            prompts = [prompts]
-        elif prompts and all(isinstance(tok, int) for tok in prompts):
-            # A single pre-tokenized prompt, not a list of prompts.
-            prompts = [prompts]
-
-        if isinstance(sampling_params, SamplingParams):
-            sampling_params_list: List[SamplingParams] = [sampling_params] * len(
-                prompts
-            )
-        else:
-            sampling_params_list = sampling_params
-            assert len(sampling_params_list) == len(prompts), (
-                "sampling_params list must match prompts length"
-            )
-        if per_sequence_compression_params is None:
-            compression_params_list: List[SequenceCompressionParams] = [
-                SequenceCompressionParams(1.0) for _ in prompts
-            ]
-        elif isinstance(per_sequence_compression_params, SequenceCompressionParams):
-            compression_params_list = [per_sequence_compression_params] * len(prompts)
-        else:
-            # list-like
-            assert len(per_sequence_compression_params) == len(prompts), (
-                "per_sequence_compression_params list must match prompts length"
-            )
-            compression_params_list = list(per_sequence_compression_params)
-
-        seqs: List[Sequence] = []
-        for prompt, sparams, cparams in zip(
-            prompts, sampling_params_list, compression_params_list
-        ):
-            token_ids = self.tokenize_prompt(prompt, **tokenizer_kwargs)
-            if cparams.protected_first_tokens + cparams.protected_last_tokens >= len(token_ids):
-                # Nothing left to prune: disable compression for THIS sequence
-                # only, on a private copy of the params.
-                cparams = copy.copy(cparams)
-                cparams.compression_ratio = 1.0
-            seqs.append(
-                Sequence(
-                    prompt_token_ids=token_ids,
-                    sampling_params=sparams,
-                    compression_params=cparams,
-                )
-            )
-        return seqs
-
-    def generate(
-        self,
-        prompts: List[PromptLike] | PromptLike,
-        sampling_params: SamplingParams | List[SamplingParams],
-        batch_compression_params: BatchCompressionParams,
-        *,
-        per_sequence_compression_params: Union[
-            List[SequenceCompressionParams], SequenceCompressionParams
-        ] = None,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        detokenizer_kwargs: Optional[dict[str, Any]] = None,
-        return_sequences: bool = False,
-    ) -> List[str] | tuple[List[str], List[Sequence]]:
-        """
-        Run generation for a batch of prompts and return the decoded text.
-        Args:
-            :param prompts:
-                Single prompt or list of prompts, each either a raw text prompt,
-                or pre-tokenized input IDs.
-            :param sampling_params:
-                A single SamplingParams for all prompts in this batch or a list of
-                SamplingParams with the same length as ``prompts``.
-            :param batch_compression_params:
-                Compression settings for this batch.
-            :param per_sequence_compression_params:
-                Per-sequence compression parameters (compression ratio and the
-                number of leading / trailing tokens kept uncompressed). A single
-                instance applies to every sequence; a list is matched to the
-                prompts one-to-one. ``None`` is passed through unchanged.
-            :param tokenizer_kwargs:
-                Extra kwargs forwarded to ``tokenizer(...)`` when tokenizing
-                string prompts.
-            :param detokenizer_kwargs:
-                Passed through to `tokenizer.batch_decode`.
-            :param return_sequences:
-                Whether to also return the Sequence objects.
-        Returns:
-            :return List[str] or tuple[List[str], List[Sequence]]:
-                One decoded string per prompt; with ``return_sequences=True``
-                a ``(strings, sequences)`` tuple.
-        """
-        if tokenizer_kwargs is None:
-            tokenizer_kwargs = {}
-        if detokenizer_kwargs is None:
-            detokenizer_kwargs = {}
-        seqs = self._build_sequences(
-            prompts,
-            sampling_params=sampling_params,
-            per_sequence_compression_params=per_sequence_compression_params,
-            tokenizer_kwargs=tokenizer_kwargs,
-        )
-        # Sequences are passed to the rank-0 runner before decoding.
-        self.master_model_runner.generate(seqs, batch_compression_params)
-        texts = self.detokenize_prompt(seqs, **detokenizer_kwargs)
-        return (texts, seqs) if return_sequences else texts
-
-    def generate_chat(
-        self,
-        messages_batch: List[List[dict]],
-        sampling_params: SamplingParams | List[SamplingParams],
-        batch_compression_params: BatchCompressionParams,
-        per_sequence_compression_params: Union[
-            SequenceCompressionParams, List[SequenceCompressionParams]
-        ],
-        *,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        detokenizer_kwargs: Optional[dict[str, Any]] = None,
-        return_sequences: bool = False,
-    ) -> List[str] | tuple[List[str], List[Sequence]]:
-        """
-        Chat-style wrapper around :meth:`generate` using HF `apply_chat_template`.
-        Args:
-            :param messages_batch:
-                List of conversations, where each conversation is a list of
-                message dicts like:
-                    {"role": "system" | "user" | "assistant", "content": str}
-            :param sampling_params:
-                A single SamplingParams for all conversations or a list with
-                the same length as ``messages_batch``.
-            :param batch_compression_params:
-                Batch Level compression settings. Can set compression_method.
-            :param per_sequence_compression_params:
-                Per-sequence compression parameters (compression ratio and the
-                number of leading / trailing tokens kept uncompressed). A single
-                instance applies to every conversation; a list is matched to
-                the conversations one-to-one.
-            :param tokenizer_kwargs:
-                Passed through to `tokenizer.apply_chat_template`, after the
-                chat-template defaults have been merged in.
-            :param detokenizer_kwargs:
-                Passed through to `tokenizer.batch_decode`.
-            :param return_sequences:
-                Whether to also return the Sequence objects.
-        Returns:
-            :return List[str] or tuple[List[str], List[Sequence]]:
-                One string per conversation.
-        """
-        chat_kwargs = _merge_apply_chat_template_kwargs(
-            self.tokenizer, tokenizer_kwargs
-        )
-        if detokenizer_kwargs is None:
-            detokenizer_kwargs = {}
-
-        def _encode(conversation):
-            # Tokenize one conversation; array-like results become plain lists.
-            ids = self.tokenizer.apply_chat_template(
-                conversation,
-                tokenize=True,
-                **chat_kwargs,
-            )
-            return ids.tolist() if hasattr(ids, "tolist") else ids
-
-        prompts_token_ids: List[List[int]] = [_encode(m) for m in messages_batch]
-
-        return self.generate(
-            prompts_token_ids,
-            sampling_params=sampling_params,
-            batch_compression_params=batch_compression_params,
-            per_sequence_compression_params=per_sequence_compression_params,
-            tokenizer_kwargs=chat_kwargs,
-            detokenizer_kwargs=detokenizer_kwargs,
-            return_sequences=return_sequences,
-        )
-
-    def generate_from_sequences(
-        self,
-        seqs: List[Sequence],
-        batch_compression_params: BatchCompressionParams,
-    ) -> List[Sequence]:
-        """
-        Run generation on already-built sequences via the master model runner.
-
-        Args:
-            :param seqs:
-                List of Sequence instances.
-            :param batch_compression_params:
-                Compression settings, passed through to the runner unchanged.
-
-        Returns:
-            :return List[Sequence]:
-                The very same list object that was passed in; the runner is
-                expected to fill in the completions in place.
-        """
-        runner = self.master_model_runner
-        runner.generate(seqs, batch_compression_params)
-        return seqs
-
--- a/vllm/kvprune_legacy_save/core/memory_manager.py
+++ b/vllm/kvprune_legacy_save/core/memory_manager.py
-import logging
-import os
-from typing import Iterable, List, Optional
-
-import torch
-from vllm.kvprune.config.engine_config import LLMConfig
-from vllm.kvprune.kv_cache.page_table import KVAllocationStatus, PagedKVCache
-from vllm.kvprune.utils.tp_utils import kv_heads_shard_divisor
-from torch import nn
-
-logger = logging.getLogger(__name__)
-
-
-class KVCacheManager:
-    """Per-rank owner of the paged KV cache and of the seq-id -> cache-batch map.
-
-    The manager sizes the page pool from the GPU memory budget
-    (:meth:`get_num_pages`), builds the :class:`PagedKVCache`
-    (:meth:`init_cache`), attaches the per-layer cache views to the model's
-    attention modules, and reserves / frees / reclaims cache space for
-    sequences identified by integer ids.
-    """
-
-    def __init__(
-        self,
-        rank: int,
-        config: LLMConfig,
-        *,
-        device: str | None = None,
-    ):
-        """Read sizing parameters from ``config``; the cache is built in ``init_cache``.
-
-        Args:
-            rank: Rank of this process; used as the CUDA device index when
-                ``device`` is not given.
-            config: Engine configuration (``hf_config``, page size, memory
-                fraction, sequence limits).
-            device: Device string for the cache tensors. Defaults to
-                ``f"cuda:{rank}"``.
-        """
-        super().__init__()
-        hf_config = config.hf_config
-        self.rank = rank
-        self.gpu_frac = config.gpu_memory_utilization
-        self.page_size = config.kvcache_page_size
-        self.world_size = config.tensor_parallel_size
-        self.max_num_batches = config.max_num_seqs
-        self.max_model_len = config.max_model_len
-        self.num_layers = hf_config.num_hidden_layers
-        self.model_dtype = hf_config.torch_dtype
-        self.head_dim = self._resolve_head_dim(hf_config)
-        # Ceil-divide: logical pages needed to hold one max-length sequence.
-        self.max_pages_per_batch = (
-            self.max_model_len + self.page_size - 1
-        ) // self.page_size
-        _ws = kv_heads_shard_divisor()
-        assert hf_config.num_key_value_heads % _ws == 0, (
-            "tensor-parallel world size needs to divide num_kv_heads"
-        )
-        self.num_kv_heads = hf_config.num_key_value_heads // _ws
-        self._cache_device = device if device is not None else f"cuda:{self.rank}"
-
-        # Filled in by init_cache / estimate_max_batched_tokens.
-        self.num_pages = None
-        self.paged_cache: Optional[PagedKVCache] = None
-        self.max_batched_tokens = None
-
-        self.seq_id_to_batch = {}
-
-    @staticmethod
-    def _resolve_head_dim(hf_config) -> Optional[int]:
-        """Return the attention head size, deriving it when the config omits it.
-
-        Many Hugging Face configs carry no explicit ``head_dim``; in that case
-        the conventional ``hidden_size // num_attention_heads`` is used.
-        Returns ``None`` when neither form is available.
-        """
-        head_dim = getattr(hf_config, "head_dim", None)
-        if head_dim is not None:
-            return head_dim
-        hidden_size = getattr(hf_config, "hidden_size", None)
-        num_heads = getattr(hf_config, "num_attention_heads", None)
-        if hidden_size and num_heads:
-            return hidden_size // num_heads
-        return None
-
-    def allocate_sequences(
-        self, seq_ids: List[int], max_positions: List[int]
-    ) -> tuple[bool, Optional[torch.Tensor]]:
-        """Reserve cache space for each sequence, creating batch slots on demand.
-
-        Args:
-            seq_ids: Sequence ids; ids not seen before get a new batch slot.
-            max_positions: Token count to reserve per sequence (paired with
-                ``seq_ids`` by position).
-
-        Returns:
-            ``(True, batch_mapping)`` on success, where ``batch_mapping`` is an
-            int32 tensor of batch slots in ``seq_ids`` order on the cache
-            device; ``(False, None)`` as soon as a slot or page reservation
-            fails. NOTE(review): reservations made earlier in the same call
-            are not rolled back on failure.
-        """
-        batch_mapping = []
-        for seq_id, len_to_alloc in zip(seq_ids, max_positions):
-            if seq_id not in self.seq_id_to_batch:
-                batch_id = self.paged_cache.new_batch()
-                if batch_id is None:
-                    logger.warning("Failed to allocate batch!")
-                    return False, None
-                self.seq_id_to_batch[seq_id] = int(batch_id)
-            batch_mapping.append(self.seq_id_to_batch[seq_id])
-            if (
-                alloc_status := self.paged_cache.reserve_tokens(
-                    self.seq_id_to_batch[seq_id], len_to_alloc
-                )
-            ) != KVAllocationStatus.SUCCESS:
-                logger.warning(f"Failed to allocate pages ({alloc_status})!")
-                return False, None
-        # Place the mapping on the cache's own device rather than whichever CUDA
-        # device happens to be current.
-        batch_mapping = torch.as_tensor(
-            batch_mapping, dtype=torch.int32, device=self._cache_device
-        )
-        return True, batch_mapping
-
-    def free_sequences(self, seq_ids: Iterable[int]):
-        """Release the batch slot of every known sequence id; unknown ids are ignored."""
-        for seq_id in seq_ids:
-            global_batch_id = self.seq_id_to_batch.pop(seq_id, None)
-            if global_batch_id is None:
-                # Never allocated (or already freed): nothing to hand back.
-                continue
-            self.paged_cache.free_batch(global_batch_id)
-
-    def init_cache(self, model: nn.Module):
-        """Size and build the paged cache, then attach its views to ``model``."""
-        self.num_pages = self.get_num_pages(self.gpu_frac, self.max_pages_per_batch)
-        self.paged_cache = PagedKVCache(
-            num_layers=self.num_layers,
-            H_kv=self.num_kv_heads,
-            head_dim=self.head_dim,
-            page_size=self.page_size,
-            num_pages=int(self.num_pages),
-            max_num_batches=self.max_num_batches,
-            device=self._cache_device,
-            dtype=self.model_dtype,
-            max_logical_pages_per_head=int(self.max_pages_per_batch),
-        )
-        self._assign_cache_to_layers(model)
-
-    def _assign_cache_to_layers(self, model) -> None:
-        """Point each layer's ``self_attn.attn`` at its slice of the paged cache."""
-        for layer_index, layer in enumerate(model.model.layers):
-            attn = layer.self_attn.attn
-            k, v, pt, bh = self.paged_cache.layer_slices(layer_index)
-            attn.k_cache = k
-            attn.v_cache = v
-            attn.page_table = pt
-            attn.bh_seq_lens = bh
-            attn.page_size = self.page_size
-
-    def get_num_pages(self, frac: float, n_logical_pages_max: int):
-        """Compute how many KV pages fit in the memory budget.
-
-        Args:
-            frac: Fraction of total device memory the engine may use.
-            n_logical_pages_max: Logical pages per (batch, head) in the page table.
-
-        Returns:
-            Number of pages (at least 1).
-
-        Raises:
-            RuntimeError: If the head dimension is unknown, no memory is left
-                for the KV pool, or the page tables alone exceed the budget.
-        """
-        if self.head_dim is None:
-            raise RuntimeError(
-                "cannot size the KV cache: hf_config provides neither head_dim "
-                "nor hidden_size/num_attention_heads"
-            )
-        free, total = torch.cuda.mem_get_info()
-        used = total - free
-        stats = torch.cuda.memory_stats()
-        # .get(..., 0) mirrors estimate_max_batched_tokens and tolerates an
-        # allocator that has not recorded these counters yet.
-        peak = int(stats.get("allocated_bytes.all.peak", 0))
-        current = int(stats.get("allocated_bytes.all.current", 0))
-        bytes_for_kv_budget = int(total * frac * 0.9) - used - peak + current
-
-        if bytes_for_kv_budget <= 0:
-            # Standalone compactor: ``frac`` is a fraction of total VRAM. When a second
-            # engine shares the GPU with vLLM (shared weights), most VRAM is already
-            # committed; the formula above goes negative. Fall back to a slice of
-            # *currently free* memory for the compactor KV pool.
-            free_frac = float(
-                os.environ.get("VLLM_KVPRUNE_COMPACTOR_KV_FREE_FRAC", "0.55")
-            )
-            free_frac = max(0.05, min(free_frac, 0.95))
-            bytes_for_kv_budget = int(free * free_frac)
-            logger.warning(
-                "KV cache budget from gpu_memory_utilization (%.2f) is exhausted "
-                "(%.2f MiB free on device); using %.0f%% of free memory (~%.2f MiB) "
-                "for compactor KV (set VLLM_KVPRUNE_COMPACTOR_KV_FREE_FRAC to adjust).",
-                frac,
-                free / (1024**2),
-                free_frac * 100,
-                bytes_for_kv_budget / (1024**2),
-            )
-        if bytes_for_kv_budget <= 0:
-            raise RuntimeError(
-                "Insufficient memory for compactor KV cache: no free GPU memory left "
-                "after the primary vLLM engine. Lower vLLM gpu_memory_utilization or "
-                "max_model_len, shorten prompts, or run compactor-only / vLLM-only "
-                "sessions. Raising gpu_memory_utilization here does not help."
-            )
-        # page_table[L, B, H_kv, N_LOGICAL_PAGES_MAX] + bh_seq_lens[L, B, H_kv]
-        int32_sz = torch.empty((), dtype=torch.int32).element_size()  # 4
-        page_table_bytes_per_layer = (
-            self.max_num_batches
-            * self.num_kv_heads
-            * n_logical_pages_max
-            * int32_sz  # page_table
-            + self.max_num_batches * self.num_kv_heads * int32_sz
-        )
-        total_page_table_bytes = self.num_layers * page_table_bytes_per_layer
-        kv_bytes_net = bytes_for_kv_budget - total_page_table_bytes
-        if kv_bytes_net <= 0:
-            # Tight VRAM: metadata alone can exceed the first budget; reserve page
-            # tables plus a slice of remaining free for KV tensors.
-            bytes_for_kv_budget = min(
-                int(free * 0.95),
-                total_page_table_bytes + max(int(free * 0.25), 8 * 1024 * 1024),
-            )
-            kv_bytes_net = bytes_for_kv_budget - total_page_table_bytes
-        if kv_bytes_net <= 0:
-            raise RuntimeError(
-                "page-table footprint exceeds available GPU memory for compactor KV. "
-                f"Reduce vLLM max_num_seqs (compactor uses {self.max_num_batches}) "
-                f"or max_model_len ({self.max_model_len}), or free GPU memory."
-            )
-        dtype_sz = torch.empty((), dtype=self.model_dtype).element_size()
-        # One page holds K and V (factor 2) for page_size tokens, in every layer.
-        bytes_per_page_across_layers = self.num_layers * (
-            2 * self.page_size * self.head_dim * dtype_sz
-        )
-        return max(1, kv_bytes_net // bytes_per_page_across_layers)
-
-    def estimate_max_batched_tokens(
-        self,
-        warmup_tokens: int,
-        bytes_used_before_warmup: int,
-        bytes_peak_after_warmup: int,
-    ) -> int:
-        """
-        Estimate the max total number of tokens that can be processed concurrently
-        without OOM.
-
-        Args:
-            warmup_tokens: Number of tokens used in the warmup forward pass (> 0).
-            bytes_used_before_warmup: Allocated bytes measured before warmup.
-            bytes_peak_after_warmup: Peak allocated bytes measured after warmup.
-
-        Returns:
-            The estimate, which is also stored in ``self.max_batched_tokens``.
-        """
-        assert warmup_tokens > 0, "warmup_tokens must be > 0"
-        # activation bytes per token
-        warmup_delta = max(
-            0, int(bytes_peak_after_warmup) - int(bytes_used_before_warmup)
-        )
-        bytes_per_token = max(1, (warmup_delta + warmup_tokens - 1) // warmup_tokens)
-
-        free, total = torch.cuda.mem_get_info()
-        target = int(total * self.gpu_frac)
-        used_now = int(total - free)
-        # reserve headroom equal to the gap between peak and current allocations seen so far
-        stats = torch.cuda.memory_stats()
-        peak_cur = int(stats.get("allocated_bytes.all.peak", 0))
-        cur_now = int(stats.get("allocated_bytes.all.current", 0))
-        cushion = max(0, peak_cur - cur_now)
-
-        activation_budget = int(max(0, target - used_now - cushion) * 0.95)
-        max_tokens_per_batch = activation_budget // bytes_per_token
-        max_tokens_in_cache = (self.num_pages * self.page_size) // self.num_kv_heads
-        # round to lower multiple of page size
-        max_tokens_per_batch = (max_tokens_per_batch // self.page_size) * self.page_size
-        max_tokens_in_cache = (max_tokens_in_cache // self.page_size) * self.page_size
-
-        # When vLLM shares the same GPU, ``used_now`` often exceeds ``target`` (same
-        # situation as ``get_num_pages``), so activation_budget is ~0 and
-        # ``max_tokens_per_batch`` rounds to 0 or one page. The min(...) would then
-        # cap prefill at ~page_size tokens (e.g. 32) even though the compactor KV pool
-        # is large — no prompt longer than that can be scheduled. Prefer KV capacity
-        # (capped by max_model_len) whenever activation math yields only a token or two.
-        if (
-            max_tokens_in_cache > 0
-            and max_tokens_per_batch <= self.page_size
-            and max_tokens_in_cache > max_tokens_per_batch
-        ):
-            max_tokens_per_batch = min(max_tokens_in_cache, self.max_model_len)
-
-        self.max_batched_tokens = min(max_tokens_in_cache, max_tokens_per_batch)
-        # Last resort: allow at least one page when KV exists but min(...) is still 0.
-        if self.max_batched_tokens == 0 and self.num_pages > 0 and max_tokens_in_cache > 0:
-            self.max_batched_tokens = min(max_tokens_in_cache, self.page_size)
-        return self.max_batched_tokens
-
-    @property
-    def num_free_batches(self) -> int:
-        """Number of batch slots currently on the cache's free list."""
-        return len(self.paged_cache.free_batches)
-
-    @property
-    def num_free_pages(self) -> int:
-        """Smallest free-page count across the cache's ``free_pages`` lists."""
-        return min(len(fp) for fp in self.paged_cache.free_pages)
-
-    def reclaim_pages(
-        self,
-        seq_ids_to_reclaim: Iterable[int],
-        future_reserved_buffer: List[int] | torch.Tensor,
-    ) -> int:
-        """Ask the cache to reclaim pages for the given sequences.
-
-        Args:
-            seq_ids_to_reclaim: Ids of sequences that already own a batch slot.
-            future_reserved_buffer: Per-sequence value forwarded to
-                ``PagedKVCache.reclaim_pages`` (indexed by position).
-
-        Returns:
-            Sum of the values returned by the cache (approximate bytes freed).
-
-        Raises:
-            KeyError: If a sequence id has no batch slot.
-        """
-        approximate_bytes_freed = 0
-        for i, seq_id in enumerate(seq_ids_to_reclaim):
-            batch_idx = self.seq_id_to_batch[seq_id]
-            approximate_bytes_freed += self.paged_cache.reclaim_pages(
-                batch_idx, future_reserved_buffer[i]
-            )
-        return approximate_bytes_freed
--- a/vllm/kvprune_legacy_save/core/model_runner.py
+++ b/vllm/kvprune_legacy_save/core/model_runner.py
-import atexit
-import logging
-import os
-import inspect
-from typing import Any, List, Optional
-
-import torch
-import torch.nn as nn
-import torch.distributed as dist
-from vllm.kvprune.attention.sparse_decode_kernel import num_splits_heuristic
-from vllm.kvprune.compression.compression_config import BatchCompressionParams
-from vllm.kvprune.config.constants import RESERVED_BATCH
-from vllm.kvprune.config.engine_config import LLMConfig, KvpruneAttentionSchedule
-from vllm.kvprune.core.memory_manager import KVCacheManager
-from vllm.kvprune.core.scheduler import Scheduler
-from vllm.kvprune.layers.sampler import Sampler
-from vllm.kvprune.models import MODEL_REGISTRY
-from vllm.kvprune.utils.arguments import (
-    DecodeBatchArguments,
-    DecodeBatchOutput,
-    PackedTensorArguments,
-    PrefillBatchArguments,
-)
-from vllm.kvprune.utils.context import CompressionContext, reset_context, set_context
-from vllm.kvprune.utils.kv_dist import barrier_sync, broadcast_from_tp_rank0
-from vllm.kvprune.utils.sequence import Sequence
-from torch.multiprocessing import Event
-from tqdm import tqdm
-
-logger = logging.getLogger(__name__)
-
-
-class ModelRunner:
-    """Per-rank execution loop. Manages model, sampler, KV cache, and warmup"""
-
-    def __init__(
-        self,
-        config: LLMConfig,
-        rank: int,
-        batch_ready: Optional[Event] = None,
-        peer_events: List[Event] = None,
-        external_model: Optional[nn.Module] = None,
-        *,
-        embedded_in_vllm_worker: bool = False,
-        device: Optional[torch.device] = None,
-    ):
-        """Set up the model, sampler, KV cache and (optionally) CUDA graphs for one rank.
-
-        Args:
-            config: Engine configuration; ``eos_token_ids`` must already be set.
-            rank: Rank of this runner. Ignored for ``self.rank`` when
-                ``embedded_in_vllm_worker`` is true (the vLLM tensor-parallel
-                rank is used instead).
-            batch_ready: Event stored on the runner; ``loop`` waits on it.
-            peer_events: Events stored on the runner (``None`` becomes ``[]``);
-                ``generate`` sets them on the master.
-            external_model: Pre-built model to use instead of constructing and
-                loading one from ``MODEL_REGISTRY``.
-            embedded_in_vllm_worker: Reuse the already-initialised
-                ``torch.distributed`` state instead of creating a process group.
-            device: Device to run on; defaults to the current CUDA device
-                (embedded) or ``cuda:{rank}`` (standalone).
-
-        Raises:
-            RuntimeError: On tensor-parallel / world-size mismatches, or when
-                embedded mode is requested without an initialised process group.
-            NotImplementedError: Embedded mode with a distributed world size
-                different from the tensor-parallel size.
-            AssertionError: If ``config.eos_token_ids`` is missing or empty.
-        """
-        self.config = config
-        self.embedded_in_vllm_worker = embedded_in_vllm_worker
-        if embedded_in_vllm_worker:
-            # Imported lazily so the standalone path does not need this module.
-            from vllm.distributed.parallel_state import (
-                get_tensor_model_parallel_rank,
-                get_tensor_model_parallel_world_size,
-            )
-
-            tp_ws = get_tensor_model_parallel_world_size()
-            tp_rank = get_tensor_model_parallel_rank()
-            if tp_ws != config.tensor_parallel_size:
-                raise RuntimeError(
-                    f"tensor parallel world size {tp_ws} != "
-                    f"LLMConfig.tensor_parallel_size {config.tensor_parallel_size}"
-                )
-            self.rank = tp_rank
-            _dev = device if device is not None else torch.device(
-                f"cuda:{torch.cuda.current_device()}"
-            )
-            if not dist.is_initialized():
-                raise RuntimeError(
-                    "embedded_in_vllm_worker requires torch.distributed to be "
-                    "initialized (vLLM worker)."
-                )
-            if dist.get_world_size() != tp_ws:
-                raise NotImplementedError(
-                    "KV-prune compactor embedded in vLLM currently requires "
-                    "dist.get_world_size() == tensor_parallel_size "
-                    "(pipeline_parallel_size=1, data_parallel_size=1). "
-                    f"Got dist.get_world_size()={dist.get_world_size()}, "
-                    f"tp_ws={tp_ws}."
-                )
-        else:
-            self.rank = rank
-            _dev = device if device is not None else torch.device(f"cuda:{rank}")
-
-        self._device = _dev
-        assert config.eos_token_ids is not None and len(config.eos_token_ids) > 0, (
-            "LLMConfig.eos_token_ids must be set (filled in LLMEngine from tokenizer)."
-        )
-        self._stop_token_ids = torch.tensor(
-            config.eos_token_ids, dtype=torch.int64, device=_dev
-        )
-        hf_config = config.hf_config
-        self.enforce_eager = config.enforce_eager
-        # PDFA always runs eagerly; only rank 0 logs when that overrides the config.
-        if config.attention_schedule == KvpruneAttentionSchedule.PDFA:
-            if not self.enforce_eager and self.rank == 0:
-                logger.info(
-                    "attention_schedule=PDFA: disabling compactor decode CUDA graphs "
-                    "(FlashAttention decode path)."
-                )
-            self.enforce_eager = True
-        # Embedded in vLLM worker (TP>1): respect :attr:`LLMConfig.enforce_eager` from
-        # ``v1_tp_runner._apply_compactor_env_overrides``. Set
-        # ``VLLM_KVPRUNE_TP_EMBEDDED_GRAPH=0`` to force eager if graph replay is unstable
-        # with shared vLLM VRAM / streams / NCCL on your stack.
-        if embedded_in_vllm_worker:
-            _tp_graph = os.environ.get(
-                "VLLM_KVPRUNE_TP_EMBEDDED_GRAPH", "1"
-            ).strip().lower()
-            if _tp_graph in ("0", "false", "no"):
-                if not self.enforce_eager:
-                    logger.info(
-                        "embedded_in_vllm_worker: VLLM_KVPRUNE_TP_EMBEDDED_GRAPH=0 → "
-                        "forcing compactor enforce_eager=True (skip compactor CUDA graph "
-                        "capture)."
-                    )
-                self.enforce_eager = True
-        self.world_size = config.tensor_parallel_size
-        self.leverage_sketch_size = config.leverage_sketch_size
-        self.show_progress_bar = config.show_progress_bar
-        self.max_num_batches = config.max_num_seqs
-        self.max_model_len = config.max_model_len
-        self.num_layers = hf_config.num_hidden_layers
-        self.model_dtype = hf_config.torch_dtype
-        self.head_dim = getattr(hf_config, "head_dim", None)
-
-        init_kwargs = {}
-        if not embedded_in_vllm_worker:
-            # Pass ``device_id`` only when this torch version's
-            # init_process_group accepts it.
-            if "device_id" in inspect.signature(dist.init_process_group).parameters:
-                init_kwargs["device_id"] = torch.device(f"cuda:{rank}")
-            if not dist.is_initialized():
-                dist.init_process_group(
-                    "nccl",
-                    f"tcp://localhost:{config.nccl_port}",
-                    world_size=self.world_size,
-                    rank=rank,
-                    **init_kwargs,
-                )
-            else:
-                # A process group already exists: reuse it, but only if its size matches.
-                ws = dist.get_world_size()
-                if ws != self.world_size:
-                    raise RuntimeError(
-                        "torch.distributed is already initialized with "
-                        f"world_size={ws}, but compactor ModelRunner expects "
-                        f"tensor_parallel_size={self.world_size}. "
-                        "Use tensor_parallel_size matching the active process group "
-                        "(typically 1 when sharing weights with vLLM)."
-                    )
-        torch.cuda.set_device(_dev)
-        # Temporarily make the model dtype and CUDA the global defaults while the
-        # model, warmup tensors and KV cache are created. The dtype is restored
-        # below; the default device is set to "cpu" afterwards (not to its prior value).
-        default_dtype = torch.get_default_dtype()
-        torch.set_default_dtype(hf_config.torch_dtype)
-        torch.set_default_device("cuda")
-        model_type = hf_config.model_type
-        if external_model is not None:
-            self.model = external_model
-        else:
-            self.model = MODEL_REGISTRY[model_type](hf_config)
-            self.model.load_model(
-                config.path, use_tqdm=self.is_master and self.show_progress_bar
-            )
-        self.sampler = Sampler()
-
-        # Bracket the first warmup with allocator readings; they feed
-        # estimate_max_batched_tokens below.
-        pre_warmup_mem = torch.cuda.memory_stats().get("allocated_bytes.all.current", 0)
-        # No paged KV yet: FA-only varlen path (see :meth:`warmup`).
-        self.warmup(num_warmup_tokens=self.max_model_len, with_kv=False)
-        post_warmup_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
-
-        self.kv_manager = KVCacheManager(
-            self.rank, config, device=str(self._device)
-        )
-        self.kv_manager.init_cache(self.model)
-
-        self.store_stream: Optional[torch.cuda.Stream] = torch.cuda.Stream()
-        torch.set_default_device("cpu")
-        torch.set_default_dtype(default_dtype)
-
-        self.batch_ready = batch_ready
-        self.peer_events = peer_events if peer_events is not None else []
-        # Embedded TP peers: session end is signaled via TP-group broadcast in
-        # maybe_release_peers (no multiprocessing.Event — not pickleable over RPC).
-        self._embedded_peer_continue = True
-        self.captured_graphs = {}
-        self.min_captured_len = {}
-        self.max_batched_tokens = self.kv_manager.estimate_max_batched_tokens(
-            self.max_model_len, pre_warmup_mem, post_warmup_peak
-        )
-        if self.is_master:
-            logger.info(f"Estimated max batched tokens of {self.max_batched_tokens}")
-        # Second warmup, now with the paged KV cache and the configured schedule.
-        self.warmup(num_warmup_tokens=self.max_model_len, with_kv=True)
-
-        if not self.enforce_eager:
-            # Batch sizes are powers of two: 1, 2, 4, ... for
-            # max_num_batches.bit_length() entries.
-            bs = [1 << i for i in range(self.max_num_batches.bit_length())]
-            # NOTE(review): the loop variable rebinds the name ``bs``; this works
-            # because the iterable expression is evaluated once before the loop.
-            for bs in (
-                tqdm(bs, desc="Capturing CUDA Graphs")
-                if self.is_master and self.show_progress_bar
-                else bs
-            ):
-                for seq_len in [1024, 4096, 8192, 16384]:
-                    self.capture_cudagraph(bs, seq_len)
-
-            if not self.captured_graphs:
-                logger.warning(
-                    "No compactor CUDA graphs were captured (KV budget tight or "
-                    "allocate_sequences failed during capture). Using eager decode "
-                    "for this session."
-                )
-                self.enforce_eager = True
-
-        self.packed_args = PackedTensorArguments(
-            rank=self.rank,
-            max_batched_tokens=self.max_batched_tokens,
-            config=self.config,
-            device=self._device,
-            use_tp_group_for_collectives=embedded_in_vllm_worker,
-        )
-        # Make sure graphs / process group are released at interpreter shutdown.
-        atexit.register(self.exit)
-
-    @torch.inference_mode()
-    def warmup(self, num_warmup_tokens: int, *, with_kv: bool):
-        """Run two dummy prefill passes over ``num_warmup_tokens`` EOS tokens.
-
-        Args:
-            num_warmup_tokens: Length of the single dummy sequence.
-            with_kv: When true, a KV slot is reserved under sequence id ``-1``
-                and the configured attention schedule is used; when false the
-                pass runs without a batch mapping under
-                ``FA_PREFILL_TRITON_DECODE``.
-
-        The attention context is always reset and the warmup KV slot always
-        freed, even when the forward pass raises, so a failed warmup does not
-        leave stale global context or a leaked cache slot behind.
-        """
-        sched = (
-            self.config.attention_schedule
-            if with_kv
-            else KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-        )
-        if self.rank == 0:
-            logger.info(
-                "Warming up compactor attention (%s KV init): schedule=%s",
-                "after" if with_kv else "before",
-                sched.name,
-            )
-        device = self._device
-        input_ids = torch.tensor(
-            [self.config.eos] * num_warmup_tokens, device=device, dtype=torch.int64
-        )
-        positions = torch.arange(num_warmup_tokens, device=device, dtype=torch.int64)
-        cu_seqlens_q = torch.tensor(
-            [0, num_warmup_tokens], device=device, dtype=torch.int32
-        )
-        cu_seqlens_k = torch.tensor(
-            [0, num_warmup_tokens], device=device, dtype=torch.int32
-        )
-        if with_kv:
-            success, batch_mapping = self.kv_manager.allocate_sequences(
-                [-1], [num_warmup_tokens]
-            )
-            assert success, "failed to allocate KV pages for warmup"
-        else:
-            batch_mapping = None
-        try:
-            set_context(
-                is_prefill=True,
-                do_compression=False,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                cu_seqlens_q_host=(0, num_warmup_tokens),
-                cu_seqlens_k_host=(0, num_warmup_tokens),
-                max_seqlen_q=num_warmup_tokens,
-                max_seqlen_k=num_warmup_tokens,
-                batch_mapping=batch_mapping,
-                attention_schedule=sched,
-            )
-            for _ in range(2):
-                # Reset before each pass so the recorded peak reflects one forward.
-                torch.cuda.reset_peak_memory_stats()
-                h = self.model(input_ids, positions)
-                self.model.compute_logits(h)
-                barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
-                if with_kv:
-                    # Zero the per-head lengths of the warmup slot (dim 1 is
-                    # indexed by batch slot) so the next pass starts empty.
-                    self.kv_manager.paged_cache.bh_seq_lens.index_fill_(
-                        1, batch_mapping.to(torch.long), 0
-                    )
-        finally:
-            reset_context()
-            if with_kv:
-                self.kv_manager.free_sequences([-1])
-
-    def exit(self):
-        """Release captured CUDA graphs and, when standalone, the process group.
-
-        Idempotent: a second call returns immediately. In embedded mode the
-        process group belongs to the host vLLM worker and is left untouched.
-        Uses ``getattr`` defaults throughout because this may run (via
-        ``atexit``) on a partially constructed instance.
-        """
-        if getattr(self, "_exited", False):
-            return
-        self._exited = True
-        owns_process_group = not getattr(self, "embedded_in_vllm_worker", False)
-        try:
-            graphs = getattr(self, "captured_graphs", None)
-            if graphs is not None:
-                graphs.clear()
-        finally:
-            # No ``return`` inside ``finally``: that would silently discard an
-            # exception raised while clearing the graphs.
-            if owns_process_group and dist.is_initialized():
-                dist.destroy_process_group()
-
-    def loop(self):
-        """Serve forever: poll ``batch_ready`` (1 s timeout) and process peer batches.
-
-        This method never returns on its own; it only ends if the event wait
-        or ``_process_batches_peer`` raises.
-        """
-        while True:
-            signalled = self.batch_ready.wait(1.0)
-            if not signalled:
-                # Timed out without work; poll again.
-                continue
-            self._process_batches_peer()
-
-    @torch.inference_mode()
-    def run_prefill(
-        self, prefill_args: PrefillBatchArguments, batch_mapping: torch.Tensor
-    ):
-        """Run one prefill forward pass and return the logits.
-
-        Args:
-            prefill_args: Packed prefill batch (token ids, positions, cumulative
-                sequence lengths, compression settings). Must contain at least
-                one sequence and one token.
-            batch_mapping: KV-cache batch slot of each sequence in the batch.
-
-        Returns:
-            The result of ``self.model.compute_logits`` on the hidden states.
-
-        The attention context is reset in a ``finally`` block so a failing
-        forward pass does not leave the global context pointing at this
-        batch's tensors.
-        """
-        assert prefill_args.B > 0 and prefill_args.N > 0
-        # Largest per-head cached length among this batch's slots. ``.item()``
-        # copies the scalar to the host.
-        max_bh_len = (
-            self.kv_manager.paged_cache.bh_seq_lens.index_select(1, index=batch_mapping)
-            .max()
-            .item()
-        )
-        compression_context = CompressionContext(
-            compression_method=prefill_args.compression_method,
-            compression_chunk_size=prefill_args.compression_chunk_size,
-            batch_tokens_to_retain=prefill_args.batch_tokens_to_retain,
-            max_tokens_to_retain=prefill_args.max_tokens_to_retain,
-            context_lens=prefill_args.context_lens.tolist(),
-            PHI=prefill_args.PHI,
-            sketch_dimension=self.leverage_sketch_size,
-            protected_first_tokens=prefill_args.protected_first,
-            protected_last_tokens=prefill_args.protected_last,
-            compression_ratio=prefill_args.compression_ratio,
-        )
-
-        def _host_tuple(t: torch.Tensor) -> tuple:
-            # Flattened host-side copy of a cumulative-length tensor.
-            return tuple(int(x) for x in t.detach().cpu().view(-1).tolist())
-
-        cu_q_host = _host_tuple(prefill_args.cu_seqlens_q)
-        cu_k_host = _host_tuple(prefill_args.cu_seqlens_k)
-        # int32 token ids break vLLM-delegated embedding (expects long indices) on some paths.
-        # ``.to(torch.int64)`` returns the same tensor when it is already int64.
-        input_ids = prefill_args.input_ids.to(torch.int64)
-        positions = prefill_args.positions.to(torch.int64)
-        set_context(
-            is_prefill=True,
-            do_compression=prefill_args.do_compression,
-            cu_seqlens_q=prefill_args.cu_seqlens_q,
-            cu_seqlens_k=prefill_args.cu_seqlens_k,
-            cu_seqlens_q_host=cu_q_host,
-            cu_seqlens_k_host=cu_k_host,
-            max_seqlen_q=prefill_args.max_seqlen_q,
-            max_seqlen_k=prefill_args.max_seqlen_k,
-            batch_mapping=batch_mapping,
-            max_bh_len=max_bh_len,
-            compression_context=compression_context,
-            STORE_STREAM=self.store_stream,
-            attention_schedule=self.config.attention_schedule,
-        )
-        try:
-            hidden = self.model(input_ids, positions)
-            logits = self.model.compute_logits(hidden)
-        finally:
-            reset_context()
-        return logits
-
-    def maybe_broadcast(self, tensor: torch.Tensor, *, label: str = "tensor") -> None:
-        """Broadcast ``tensor`` from TP rank 0 when more than one rank exists.
-
-        A no-op for a single-rank setup. ``label`` is accepted but not used by
-        this method. Always returns ``None``.
-        """
-        single_rank = self.world_size <= 1
-        if single_rank:
-            return None
-        broadcast_from_tp_rank0(tensor, use_tp_group=self.embedded_in_vllm_worker)
-        return None
-
-    def maybe_release_peers(self, do_release=False):
-        """Synchronise with peer ranks and optionally tell them the session ended.
-
-        Nothing happens for a single-rank setup. In standalone mode the master
-        clears every peer event when ``do_release`` is true, then all ranks
-        meet at a barrier. In embedded mode the master broadcasts a flag
-        (0 = release, 1 = continue) that peers store in
-        ``_embedded_peer_continue`` before the barrier.
-        """
-        if self.world_size <= 1:
-            return
-
-        if not self.embedded_in_vllm_worker:
-            if self.is_master and do_release:
-                for peer_event in self.peer_events:
-                    peer_event.clear()
-            barrier_sync(use_tp_group=False)
-            return
-
-        continue_flag = torch.zeros(1, dtype=torch.int32, device=self._device)
-        if self.is_master:
-            continue_flag[0] = 0 if do_release else 1
-        broadcast_from_tp_rank0(continue_flag, use_tp_group=True)
-        if not self.is_master:
-            self._embedded_peer_continue = bool(continue_flag[0].item())
-        barrier_sync(use_tp_group=True)
-
-    def _peer_outer_loop_active(self) -> bool:
-        """Tell a peer whether its outer processing loop should keep running.
-
-        With a ``batch_ready`` event the answer is the event's state; without
-        one, embedded peers use ``_embedded_peer_continue`` and everything
-        else reports ``False``.
-        """
-        ready_event = self.batch_ready
-        if ready_event is None:
-            return (
-                self._embedded_peer_continue
-                if self.embedded_in_vllm_worker
-                else False
-            )
-        return ready_event.is_set()
-
-    @torch.inference_mode()
-    def generate(
-        self,
-        all_sequences: List[Sequence],
-        batch_compression_params: Optional[BatchCompressionParams] = None,
-    ):
-        """Master-only entry point: wake the peers and process ``all_sequences``.
-
-        In standalone mode every event in ``peer_events`` is set first.
-        Default ``BatchCompressionParams()`` are used when none are supplied.
-        Raises ``AssertionError`` when called on a non-master rank.
-        """
-        assert self.is_master, "generate can only be called on the master process"
-        standalone = not self.embedded_in_vllm_worker
-        if standalone:
-            for peer_event in self.peer_events:
-                peer_event.set()
-        params = (
-            BatchCompressionParams()
-            if batch_compression_params is None
-            else batch_compression_params
-        )
-        self._process_batches_master(all_sequences, params)
-
-    @property
-    def is_master(self):
-        """Whether this runner is the master, i.e. its ``rank`` equals 0."""
-        master_rank = 0
-        return self.rank == master_rank
-
-    @torch.inference_mode()
-    def _process_batches_master(
-        self,
-        all_sequences: List[Sequence],
-        batch_compression_params: BatchCompressionParams,
-    ):
-        """Run the master-rank prefill/decode loop until every sequence is done.
-
-        Each iteration asks the scheduler for a prefill batch. When one is
-        returned, its sequences are allocated in the KV manager, prefilled,
-        sampled once and merged into the running ``decode_batch``; the decode
-        loop then runs only if the scheduler cannot prefill another batch. When
-        no batch is returned and nothing is pending, the loop either exits (no
-        decode state yet) or performs a decode-only pass.
-
-        Args:
-            all_sequences: Sequences handed to the :class:`Scheduler`.
-            batch_compression_params: Forwarded to
-                ``packed_args.build_prefill_args``; its ``compression_method``
-                is also used for the informational log line.
-
-        Raises:
-            RuntimeError: If no prefill batch can be formed while sequences are
-                still pending.
-        """
-        assert self.is_master
-        compression_details = f"Applying Compression Method: {batch_compression_params.compression_method}"
-        # Only log the method when at least one sequence actually compresses.
-        if any(seq.compression_params.compression_ratio < 1.0 for seq in all_sequences):
-            logger.info(compression_details)
-        scheduler = Scheduler(
-            all_sequences=all_sequences,
-            kv_manager=self.kv_manager,
-            use_tqdm=self.show_progress_bar,
-        )
-        decode_batch = DecodeBatchArguments()
-        # Two-slot buffer: [0] = run_decode flag, [1] = occupancy (filled below
-        # and passed to maybe_broadcast when world_size > 1).
-        decode_flags = torch.empty(2, dtype=torch.int32, device=self._device)
-        while not scheduler.is_finished():
-            sequences = scheduler.get_prefill_batch()
-            if not sequences:
-                if scheduler.pending_sequence_ids:
-                    # Work is pending but nothing fits: fail loudly instead of
-                    # spinning forever.
-                    raise RuntimeError(
-                        "KV-prune compactor cannot schedule any prefill (KV/token budget). "
-                        f"max_batched_tokens={self.kv_manager.max_batched_tokens}, "
-                        f"pending_sequences={len(scheduler.pending_sequence_ids)}. "
-                        "Lower v1 gpu_memory_utilization / max_model_len, set "
-                        "VLLM_KVPRUNE_RELEASE_V1_KV=1 to discard v1 KV (sleep+wake), "
-                        "or free GPU memory. Diagnostics: "
-                        f"{scheduler.diagnose_prefill_failure()}"
-                    )
-                # Pending is empty: either finished or decode-only continuation.
-                if decode_batch.token_ids is None:
-                    break
-                run_decode = True
-                occupancy = -1
-            else:
-                seq_ids_cpu = [seq.seq_id for seq in sequences]
-                scheduler.add_running_sequence_ids(seq_ids_cpu, update_status=True)
-                # Per-sequence sampling temperatures, staged in pinned host
-                # memory so the device copy can be issued non-blocking.
-                temps = torch.tensor(
-                    [s.sampling_params.temperature for s in sequences],
-                    dtype=torch.float32,
-                    pin_memory=True,
-                ).to(device=self._device, non_blocking=True)
-                prefill_arguments = self.packed_args.build_prefill_args(
-                    sequences, batch_compression_params=batch_compression_params
-                )
-                # Upper bound on each sequence's context: prompt + generation budget.
-                max_ctx_lens = (
-                    prefill_arguments.max_new_tokens + prefill_arguments.context_lens
-                )
-
-                success, batch_mapping = self.kv_manager.allocate_sequences(
-                    seq_ids_cpu, max_ctx_lens.tolist()
-                )
-                assert success, "failed to allocate pages for sequences"
-
-                logits = self.run_prefill(prefill_arguments, batch_mapping)
-                # Must match prefill `positions` dtype (int64). `context_lens` is int32
-                # from the packed buffer; using int32 here breaks RoPE indexing
-                # (`cos_sin_cache[positions]`) on CUDA for decode vs prefill.
-                positions = prefill_arguments.context_lens.to(dtype=torch.int64)
-                # First generated token of every newly prefilled sequence.
-                token_ids = self.sampler(logits, temps)
-                # Prefill KV writes + bh_seq_lens updates run on STORE_STREAM; reclaim
-                # reads bh_seq_lens on the default stream and must not race.
-                if self.store_stream is not None:
-                    torch.cuda.default_stream().wait_stream(self.store_stream)
-                # TODO: synchronize page counts across dist
-                # Page reclamation is only performed in the single-rank case.
-                if self.world_size == 1:
-                    self.kv_manager.reclaim_pages(
-                        seq_ids_cpu, prefill_arguments.max_new_tokens
-                    )
-                    # with logging_redirect_tqdm():
-                    #     logger.info(
-                    #         f"Reclaimed {reclaimed_bytes / 1e6:.2f} MB from the KV cache"
-                    #     )
-
-                if scheduler.any_pending_sequences():
-                    # Rows already held in decode_batch (0 when it is still empty).
-                    num_pending_batches = (
-                        0
-                        if decode_batch.token_ids is None
-                        else decode_batch.token_ids.shape[0]
-                    )
-                    # Target 66% of (held + newly prefilled) rows while more
-                    # sequences are waiting. Presumably consumed by the decode
-                    # loop as `desired_batch_occupancy` - confirm in
-                    # DecodeBatchArguments.update.
-                    occupancy = int((num_pending_batches + len(seq_ids_cpu)) * 0.66)
-                else:
-                    occupancy = -1
-                # Keep prefilling while another batch fits; decode otherwise.
-                run_decode = not scheduler.can_prefill_another_batch()
-                decode_batch = decode_batch.update(
-                    batch_mapping,
-                    token_ids,
-                    positions,
-                    max_ctx_lens,
-                    prefill_arguments.seq_ids,
-                    temps,
-                    occupancy,
-                )
-            if self.world_size > 1:
-                # Publish the decision so peer ranks follow the same branch.
-                decode_flags[0] = int(run_decode)
-                decode_flags[1] = occupancy
-                self.maybe_broadcast(decode_flags, label="decode_flags")
-            if not run_decode:
-                continue
-            # Make the default stream wait for outstanding store-stream work
-            # before decoding.
-            if self.store_stream is not None:
-                torch.cuda.default_stream().wait_stream(self.store_stream)
-
-            decode_output, decode_batch = self.run_decode_loop(decode_batch)
-            # Anything active that is no longer in the returned batch has finished.
-            finished_sequence_ids = scheduler.get_finished_sequence_ids_from_unfinished(
-                decode_batch.seq_ids.tolist()
-            )
-            scheduler.record_finished_sequence_ids(
-                finished_sequence_ids, update_status=True
-            )
-            self.kv_manager.free_sequences(finished_sequence_ids)
-            self.maybe_release_peers(scheduler.is_finished())
-            # Append the tokens produced by this decode pass to their sequences.
-            scheduler.update_sequences(
-                decode_output.output_tokens.tolist(),
-                decode_output.output_seq_ids.tolist(),
-            )
-        scheduler.close()
-
-    @torch.inference_mode()
-    def run_peer_session(self) -> None:
-        """Run a single peer session on a non-master TP rank.
-
-        When the runner is embedded in a vLLM worker, the
-        ``_embedded_peer_continue`` flag is raised first; the peer batch loop
-        is then entered in every case.
-        """
-        runs_inside_vllm = self.embedded_in_vllm_worker
-        if runs_inside_vllm:
-            self._embedded_peer_continue = True
-        self._process_batches_peer()
-
-    @torch.inference_mode()
-    def _process_batches_peer(self):
-        """Peer-rank counterpart of the master batch loop.
-
-        Mirrors the master's sequence of prefill / decode calls without
-        sampling: prefill logits are discarded, the run-decode flag and
-        occupancy are read from ``decode_flags`` after ``maybe_broadcast``, and
-        the decode loop's output is ignored. The loop runs while
-        ``_peer_outer_loop_active()`` returns true.
-        """
-        assert not self.is_master
-        # The peer scheduler starts with no Sequence objects; it only tracks ids.
-        scheduler = Scheduler([], kv_manager=self.kv_manager)
-        decode_batch = DecodeBatchArguments()
-        decode_flags = torch.empty(2, dtype=torch.int32, device=self._device)
-        while self._peer_outer_loop_active():
-            # Called without sequences - presumably the packed arguments are
-            # received from the master inside build_prefill_args; confirm.
-            prefill_arguments = self.packed_args.build_prefill_args()
-
-            B = prefill_arguments.B
-            max_ctx_lens = (
-                prefill_arguments.max_new_tokens + prefill_arguments.context_lens
-            )
-
-            seq_ids_cpu = prefill_arguments.seq_ids.tolist()
-            scheduler.add_running_sequence_ids(seq_ids_cpu)
-            success, batch_mapping = self.kv_manager.allocate_sequences(
-                seq_ids_cpu, max_ctx_lens.tolist()
-            )
-            assert success, "failed to allocate pages for sequences"
-
-            # Logits are not needed on peers; only the forward pass matters.
-            self.run_prefill(prefill_arguments, batch_mapping)
-            # int64 to match the dtype used for positions on the master.
-            positions = prefill_arguments.context_lens.to(dtype=torch.int64)
-            self.maybe_broadcast(decode_flags, label="decode_flags")
-            run_decode = bool(decode_flags[0].item())
-            occupancy = int(decode_flags[1].item())
-            # Uninitialised placeholder; presumably overwritten by the
-            # "decode_token_ids" broadcast in run_decode_loop - confirm.
-            token_ids = torch.empty(B, dtype=torch.int64, device=self._device)
-            decode_batch = decode_batch.update(
-                batch_mapping,
-                token_ids,
-                positions,
-                max_ctx_lens,
-                prefill_arguments.seq_ids,
-                None,  # temps not used in peer process
-                occupancy,
-            )
-
-            if not run_decode:
-                continue
-            if self.store_stream is not None:
-                torch.cuda.default_stream().wait_stream(self.store_stream)
-
-            _, decode_batch = self.run_decode_loop(decode_batch)
-            # Active ids missing from the returned batch are finished.
-            finished_sequence_ids = scheduler.get_finished_sequence_ids_from_unfinished(
-                decode_batch.seq_ids.tolist()
-            )
-            scheduler.record_finished_sequence_ids(finished_sequence_ids)
-            self.kv_manager.free_sequences(finished_sequence_ids)
-            self.maybe_release_peers()
-        scheduler.close()
-
-    @torch.inference_mode()
-    def run_decode_loop(
-        self,
-        decode_batch: DecodeBatchArguments,
-    ) -> tuple[DecodeBatchOutput, DecodeBatchArguments]:
-        """Decode until the batch is empty or has shrunk to the desired occupancy.
-
-        Every iteration drops rows whose current token is a stop token or whose
-        position has reached ``max_ctx_lens``, then runs one decode step on the
-        remaining rows. The loop exits when no rows remain or when the number of
-        remaining rows is at most ``decode_batch.desired_batch_occupancy``; the
-        remaining row count is stored in ``decode_batch.num_stashed_batches``.
-
-        Returns:
-            ``(output, decode_batch)``. On the master rank ``output`` holds the
-            concatenated CPU copies of the emitted token ids and their sequence
-            ids; on other ranks it is ``DecodeBatchOutput(None, None)``.
-        """
-        if self.is_master:
-            num_stashed_batches = decode_batch.num_stashed_batches
-            # Seed the output with the rows after the stashed prefix -
-            # presumably the stashed rows' current tokens were already emitted
-            # by an earlier call; confirm against DecodeBatchArguments.update.
-            tok_buffer = [
-                decode_batch.token_ids[num_stashed_batches:].to(
-                    "cpu", non_blocking=True
-                )
-            ]
-            seq_buffer = [
-                decode_batch.seq_ids[num_stashed_batches:].to("cpu", non_blocking=True)
-            ]
-        while True:
-            self.maybe_broadcast(decode_batch.token_ids, label="decode_token_ids")
-            # A row keeps running while its token is not a stop token and it
-            # still has context budget left.
-            not_stopped = ~torch.isin(decode_batch.token_ids, self._stop_token_ids)
-            running_batches = (decode_batch.positions < decode_batch.max_ctx_lens) & (
-                not_stopped
-            )
-            # Compact every per-row tensor with the same mask so they stay aligned.
-            decode_batch.token_ids = torch.masked_select(
-                decode_batch.token_ids, running_batches
-            )
-            decode_batch.positions = torch.masked_select(
-                decode_batch.positions, running_batches
-            )
-            decode_batch.batch_mapping = torch.masked_select(
-                decode_batch.batch_mapping, running_batches
-            )
-            decode_batch.max_ctx_lens = torch.masked_select(
-                decode_batch.max_ctx_lens, running_batches
-            )
-            decode_batch.seq_ids = torch.masked_select(
-                decode_batch.seq_ids, running_batches
-            )
-            # Temperatures are only tracked on the master (the sampling rank).
-            if self.is_master:
-                decode_batch.temps = torch.masked_select(
-                    decode_batch.temps, running_batches
-                )
-            num_remaining = decode_batch.token_ids.numel()
-            if (
-                num_remaining == 0
-                or num_remaining <= decode_batch.desired_batch_occupancy
-            ):
-                # Hand control back to the caller; remember how many rows are
-                # carried over.
-                decode_batch.num_stashed_batches = num_remaining
-                break
-            logits = self._decode_step_logits(decode_batch)
-
-            if self.is_master:
-                decode_batch.token_ids = self.sampler(logits, decode_batch.temps)
-                tok_buffer.append(decode_batch.token_ids.to("cpu", non_blocking=True))
-                seq_buffer.append(decode_batch.seq_ids.to("cpu", non_blocking=True))
-            decode_batch.positions += 1
-
-        if self.is_master:
-            # non_blocking D2H copies must finish before cat/tolist read CPU data.
-            torch.cuda.synchronize()
-            output = DecodeBatchOutput(
-                output_tokens=torch.cat(tok_buffer),
-                output_seq_ids=torch.cat(seq_buffer),
-            )
-        else:
-            output = DecodeBatchOutput(None, None)
-        return output, decode_batch
-
-    def _decode_logits_eager(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        batch_mapping: torch.Tensor,
-    ):
-        """Run one decode step without a CUDA graph and return its logits.
-
-        The decode context is installed first; token ids and positions are
-        then passed to the model as int64 (converted only when they are not
-        already int64), and the hidden states are projected to logits.
-        """
-        set_context(
-            batch_mapping=batch_mapping,
-            attention_schedule=self.config.attention_schedule,
-            is_prefill=False,
-            do_compression=False,
-        )
-        ids64 = input_ids
-        if ids64.dtype != torch.int64:
-            ids64 = ids64.long()
-        pos64 = positions
-        if pos64.dtype != torch.int64:
-            pos64 = pos64.long()
-        hidden_states = self.model(ids64, pos64)
-        return self.model.compute_logits(hidden_states)
-
-    @torch.inference_mode()
-    def _decode_step_logits(self, decode_batch: DecodeBatchArguments):
-        """Return logits for one decode step, preferring a captured CUDA graph.
-
-        Eager decode is used when ``enforce_eager`` is set or no graphs have
-        been captured. If graph decode raises, a warning is logged,
-        ``enforce_eager`` is switched on for the remaining steps and the step
-        is retried eagerly, so a graph failure never propagates from here.
-        """
-        step_inputs = (
-            decode_batch.token_ids,
-            decode_batch.positions,
-            decode_batch.batch_mapping,
-        )
-        graphs_usable = bool(self.captured_graphs) and not self.enforce_eager
-        if not graphs_usable:
-            return self._decode_logits_eager(*step_inputs)
-        try:
-            return self.run_graph_decode(*step_inputs)
-        except Exception as e:
-            logger.warning(
-                "CUDA graph decode failed (%s); switching to eager decode for "
-                "remaining steps.",
-                e,
-            )
-            # Sticky fallback: do not attempt graph replay again.
-            self.enforce_eager = True
-            return self._decode_logits_eager(*step_inputs)
-
-    @torch.inference_mode()
-    def run_graph_decode(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        batch_mapping: torch.Tensor,
-    ):
-        """Replay a captured CUDA graph for one decode step and return the logits.
-
-        A graph is looked up by batch size and the largest position in the
-        batch; when none is compatible the step falls back to eager decode.
-        Otherwise the inputs are copied into the graph's static buffers, the
-        graph is replayed, and the first ``bs`` rows of its logits buffer are
-        returned as a contiguous tensor.
-        """
-        bs = input_ids.shape[0]
-        # int() reads the value on the host.
-        max_k = int(positions.max())
-        graph_dict = self.get_cuda_graph(bs, max_k)
-        if graph_dict is None:
-            return self._decode_logits_eager(input_ids, positions, batch_mapping)
-        set_context(
-            is_prefill=False,
-            do_compression=False,
-            batch_mapping=batch_mapping,
-            attention_schedule=self.config.attention_schedule,
-        )
-        # The graph may have been captured for a larger batch: fill the first
-        # `bs` rows and mark every batch_mapping slot as RESERVED_BATCH before
-        # writing the live mapping, so unused rows do not carry stale entries.
-        graph_dict["input_ids"][:bs] = input_ids
-        graph_dict["positions"][:bs] = positions
-        graph_dict["batch_mapping"].fill_(RESERVED_BATCH)
-        graph_dict["batch_mapping"][:bs] = batch_mapping
-        graph_dict["graph"].replay()
-        logits_out = graph_dict["logits"]
-        # .contiguous() on the slice - presumably to hand back a tensor that is
-        # independent of the graph's static output buffer; confirm.
-        return logits_out[:bs].contiguous()
-
-    @torch.inference_mode()
-    def capture_cudagraph(self, batch_size: int, max_seqlen_k: int):
-        """Capture a decode CUDA graph for ``batch_size`` rows and ``max_seqlen_k``.
-
-        Dummy sequences ``0..batch_size-1`` are allocated in the KV manager, the
-        model is run once outside capture, then the forward pass plus logits
-        computation is recorded into a ``torch.cuda.CUDAGraph``. The graph and
-        its static tensors are stored in
-        ``self.captured_graphs[batch_size][max_seqlen_k]`` and
-        ``self.min_captured_len[batch_size]`` is updated. If the KV allocation
-        fails, a warning is logged and nothing is captured.
-        """
-        barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
-        device = torch.device("cuda")
-        logger.debug(
-            f"Capturing CUDA graph for batch size {batch_size} ({max_seqlen_k} tokens)"
-        )
-        # Static input buffers; replay later writes live values into them.
-        _g_input_ids = torch.zeros(batch_size, dtype=torch.int32, device=device)
-        _g_positions = torch.zeros(batch_size, dtype=torch.int64, device=device)
-        _g_hidden = None
-        key_split = num_splits_heuristic(
-            batch_size * self.kv_manager.num_kv_heads,
-            max_seq_len=max_seqlen_k,
-            num_sms=torch.cuda.get_device_properties(device).multi_processor_count,
-            max_splits=12,
-        )
-
-        # Reserve KV for the dummy sequences (length 256 each) used for capture.
-        success, _g_batch_mapping = self.kv_manager.allocate_sequences(
-            list(range(batch_size)), [256] * batch_size
-        )
-        if not success:
-            # Shared GPU with vLLM: compactor KV pool is small; large batch capture
-            # often cannot reserve [256]*batch_size per sequence. Skip this graph.
-            logger.warning(
-                "Skipping CUDA graph capture for batch_size=%s max_seqlen_k=%s "
-                "(KV allocate_sequences failed; decode will use eager or other graphs).",
-                batch_size,
-                max_seqlen_k,
-            )
-            # Same number of barrier_sync calls as the success path (two).
-            barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
-            return
-
-        set_context(
-            is_prefill=False,
-            do_compression=False,
-            batch_mapping=_g_batch_mapping,
-            key_split=key_split,
-            attention_schedule=self.config.attention_schedule,
-        )
-        # Un-captured warm-up run with the same inputs before recording.
-        _gw = self.model(_g_input_ids, _g_positions)
-        self.model.compute_logits(_gw)
-        barrier_sync(use_tp_group=self.embedded_in_vllm_worker)
-        decode_graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(decode_graph):
-            _g_hidden = self.model(_g_input_ids, _g_positions)
-            _g_logits = self.model.compute_logits(_g_hidden)
-        # Keep references to the graph and every static tensor it reads/writes.
-        graph_vars = {
-            "graph": decode_graph,
-            "input_ids": _g_input_ids,
-            "positions": _g_positions,
-            "batch_mapping": _g_batch_mapping,
-            "hidden": _g_hidden,
-            "logits": _g_logits,
-            "key_split": key_split,
-        }
-        if batch_size not in self.captured_graphs:
-            self.captured_graphs[batch_size] = {}
-            self.min_captured_len[batch_size] = float("inf")
-
-        self.captured_graphs[batch_size][max_seqlen_k] = graph_vars
-        self.min_captured_len[batch_size] = min(
-            max_seqlen_k, self.min_captured_len[batch_size]
-        )
-        # Release the dummy sequences allocated above.
-        self.kv_manager.free_sequences(list(range(batch_size)))
-
-    def get_cuda_graph(
-        self, batch_size: int, max_seqlen_k: int
-    ) -> Optional[dict[str, Any]]:
-        """Pick the captured graph that best fits the request, if any.
-
-        The smallest captured batch size that is at least ``batch_size`` is
-        chosen; within it, the largest captured sequence length that does not
-        exceed ``max_seqlen_k``. ``None`` is returned when either choice has
-        no candidate.
-        """
-        if not self.captured_graphs:
-            return None
-        usable_batch_sizes = [bs for bs in self.captured_graphs if bs >= batch_size]
-        if not usable_batch_sizes:
-            return None
-        graphs_by_len = self.captured_graphs[min(usable_batch_sizes)]
-        fitting_lens = [n for n in graphs_by_len if n <= max_seqlen_k]
-        if fitting_lens:
-            return graphs_by_len[max(fitting_lens)]
-        return None
-
--- a/vllm/kvprune_legacy_save/core/runtime.py
+++ b/vllm/kvprune_legacy_save/core/runtime.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-import torch
-
-from vllm.forward_context import get_forward_context
-
-from vllm.kvprune.core.compression_bridge import (
-    COMPRESSION_METHOD_ID_NONE,
-    compression_method_str_to_id,
-)
-
-
-@dataclass
-class KVPruneForwardState:
-    """Per-forward-pass state for KV pruning (per-layer logical lengths)."""
-
-    active: bool
-
-    # [num_reqs_padded] ratio in (0,1]; 1.0 means no pruning for that row.
-    compression_ratio_gpu: torch.Tensor
-
-    # [num_reqs_padded] int32 - see ``compression_bridge`` ids (0=none).
-    compression_method_id_gpu: torch.Tensor
-
-    # [num_reqs_padded + 1] int32 on device.
-    query_start_loc: torch.Tensor
-
-    num_reqs: int
-    num_reqs_padded: int
-    num_layers: int
-
-    # Logical KV length per layer (and optionally per KV head).
-    # Shape ``[num_layers, num_reqs_padded]`` or, when ``num_kv_heads > 1``,
-    # ``[num_layers, num_reqs_padded, num_kv_heads]`` for per-head lengths.
-    logical_seq_lens_gpu: torch.Tensor
-
-    is_prefill: bool
-    device: torch.device
-
-    def logical_seq_lens_for_layer(self, layer_idx: int) -> torch.Tensor:
-        """Return one logical length per request for layer ``layer_idx``.
-
-        A per-head (2-D) slice is reduced to its maximum over the last
-        dimension; any other slice is returned unchanged.
-        """
-        layer_lens = self.logical_seq_lens_gpu[layer_idx]
-        if layer_lens.dim() != 2:
-            return layer_lens
-        per_request_max = layer_lens.max(dim=-1)
-        return per_request_max.values
-
-
-def build_kv_prune_forward_state(
-    *,
-    req_ids: list[str],
-    requests: dict[str, object],
-    query_start_loc: torch.Tensor,
-    num_reqs: int,
-    num_reqs_padded: int,
-    num_layers: int,
-    max_num_scheduled_tokens: int,
-    device: torch.device,
-    logical_seq_lens_gpu: torch.Tensor,
-) -> KVPruneForwardState | None:
-    """Build pruning state when any request uses compression_ratio < 1.0.
-
-    For each of the first ``num_reqs`` ids, the request's
-    ``sampling_params.compression_ratio`` is read. A missing request, missing
-    sampling params, or a missing / ``None`` ratio all count as 1.0 (no
-    pruning). Rows whose ratio is below ``1.0 - 1e-6`` are "active" and get
-    their ``compression_method`` resolved to an id; all other rows use
-    ``COMPRESSION_METHOD_ID_NONE``.
-
-    Returns:
-        ``None`` when ``num_reqs`` or ``num_layers`` is not positive, or when
-        no row is active. Otherwise a :class:`KVPruneForwardState` whose
-        per-request tensors are padded to ``num_reqs_padded`` (ratio 1.0,
-        method id 0 in the padding) and whose ``is_prefill`` is true when
-        ``max_num_scheduled_tokens > 1``.
-    """
-    if num_reqs <= 0 or num_layers <= 0:
-        return None
-
-    # Ratios at or above this threshold are treated as "no pruning".
-    prune_threshold = 1.0 - 1e-6
-
-    sampling_params_per_req = []
-    ratios: list[float] = []
-    for rid in req_ids[:num_reqs]:
-        req = requests.get(rid)
-        sp = getattr(req, "sampling_params", None) if req is not None else None
-        raw_ratio = getattr(sp, "compression_ratio", None) if sp is not None else None
-        # An explicit None must not reach float(); it means "not configured".
-        ratios.append(1.0 if raw_ratio is None else float(raw_ratio))
-        sampling_params_per_req.append(sp)
-
-    if not any(r < prune_threshold for r in ratios):
-        return None
-
-    method_ids: list[int] = []
-    for sp, r in zip(sampling_params_per_req, ratios):
-        if sp is None or r >= prune_threshold:
-            mid = COMPRESSION_METHOD_ID_NONE
-        else:
-            cm = getattr(sp, "compression_method", "none") or "none"
-            mid = compression_method_str_to_id(str(cm))
-        method_ids.append(mid)
-
-    # Padding rows keep ratio 1.0 and method id 0.
-    compression_ratio_gpu = torch.ones(
-        (num_reqs_padded,), dtype=torch.float32, device=device
-    )
-    compression_ratio_gpu[:num_reqs] = torch.tensor(
-        ratios, dtype=torch.float32, device=device
-    )
-    compression_method_id_gpu = torch.zeros(
-        (num_reqs_padded,), dtype=torch.int32, device=device
-    )
-    compression_method_id_gpu[:num_reqs] = torch.tensor(
-        method_ids, dtype=torch.int32, device=device
-    )
-
-    # More than one scheduled token for some request is taken to mean prefill.
-    is_prefill = max_num_scheduled_tokens > 1
-
-    return KVPruneForwardState(
-        active=True,
-        compression_ratio_gpu=compression_ratio_gpu,
-        compression_method_id_gpu=compression_method_id_gpu,
-        query_start_loc=query_start_loc,
-        num_reqs=num_reqs,
-        num_reqs_padded=num_reqs_padded,
-        num_layers=num_layers,
-        logical_seq_lens_gpu=logical_seq_lens_gpu,
-        is_prefill=is_prefill,
-        device=device,
-    )
-
-
-def layer_index_from_layer_name(layer_name: str) -> int:
-    """Return the layer index that vLLM's ``extract_layer_index`` yields for ``layer_name``."""
-    # Imported at call time rather than at module import.
-    from vllm.model_executor.models import utils as model_utils
-
-    layer_idx = model_utils.extract_layer_index(layer_name)
-    return layer_idx
-
-
-def get_kv_prune_state() -> KVPruneForwardState | None:
-    """Return the active KV-prune state of the current forward context, if any.
-
-    ``None`` is returned when ``get_forward_context()`` raises
-    ``AssertionError``, when the context has no ``"kv_prune"`` entry, when that
-    entry is not a :class:`KVPruneForwardState`, or when it is not active.
-    """
-    try:
-        forward_ctx = get_forward_context()
-    except AssertionError:
-        return None
-    candidate = forward_ctx.additional_kwargs.get("kv_prune")
-    if isinstance(candidate, KVPruneForwardState) and candidate.active:
-        return candidate
-    return None
--- a/vllm/kvprune_legacy_save/core/scheduler.py
+++ b/vllm/kvprune_legacy_save/core/scheduler.py
-import time
-from typing import Iterable, List
-
-from vllm.kvprune.core.memory_manager import KVCacheManager
-from vllm.kvprune.utils.sequence import Sequence, SequenceStatus
-from tqdm import tqdm
-
-
-def cdiv(a, b):
-    """Ceiling division: ``a`` divided by ``b``, rounded up (positive ``b``)."""
-    padded = a + b - 1
-    return padded // b
-
-
-class Scheduler:
-    """
-    Simple sequence scheduler for prefill + decode with a paged KV cache.
-    The scheduler tracks three disjoint sets of sequence IDs:
-
-      * ``pending_sequence_ids`` – sequences that have not yet been started.
-      * ``active_sequence_ids`` – sequences currently running.
-      * ``finished_sequence_ids`` – sequences that have generated all tokens.
-
-    At prefill time, :meth:`get_prefill_batch` selects a subset of pending
-    sequences that can fit into the available KV cache and per-step token
-    budget, given the constraints from the associated :class:`KVCacheManager`.
-
-    The class also handles basic bookkeeping of sequence statuses and of the
-    input / generated token counts used for the throughput display.
-
-    Args:
-        :param all_sequences:
-            Iterable of :class:`Sequence` objects to be scheduled. Each
-            sequence must have a unique ``seq_id``. The iterable is consumed
-            exactly once, so a one-shot generator is accepted.
-        :param kv_manager:
-            A :class:`KVCacheManager` instance that this scheduler will use
-            to determine whether additional batches can be scheduled.
-        :param use_tqdm:
-            If True, a single progress bar ("Completed Batches") is created.
-            It advances when a sequence is recorded as finished with
-            ``update_status=True``; :meth:`update_sequences` replaces its
-            description with a throughput figure.
-    """
-
-    def __init__(
-        self,
-        all_sequences: Iterable[Sequence],
-        kv_manager: KVCacheManager,
-        *,
-        use_tqdm=False,
-    ):
-        self.allseq_mapping: dict[int, Sequence] = {s.seq_id: s for s in all_sequences}
-        # Derive the pending set from the mapping instead of iterating
-        # ``all_sequences`` again: a generator would already be exhausted.
-        self.pending_sequence_ids: set[int] = set(self.allseq_mapping)
-        self.active_sequence_ids: set[int] = set()
-        self.finished_sequence_ids: set[int] = set()
-        self.manager = kv_manager
-        self.use_tqdm = use_tqdm
-        self.start_time = time.perf_counter()
-        self.total_tokens_generated = 0
-        self.total_tokens_input = 0
-        self.pbar = None
-        if use_tqdm:
-            self.pbar = tqdm(
-                total=len(self.pending_sequence_ids),
-                desc="Completed Batches",
-            )
-
-    def get_prefill_batch(self) -> List[Sequence]:
-        """
-        Select a batch of pending sequences to prefill under KV/memory constraints.
-
-        The selection is greedy over ``pending_sequence_ids`` in set iteration
-        order; a sequence that does not fit is skipped, not blocking later ones.
-        A sequence is added to the batch if:
-
-          * The sum of its prompt length and the total prompt tokens selected so
-            far does not exceed ``manager.max_batched_tokens``, and
-          * There is at least one free KV "batch slot" left
-            (``manager.num_free_batches``), and
-          * The number of KV pages required by the sequence's prompt +
-            max_new_tokens (times ``manager.num_kv_heads``) does not exceed the
-            remaining free pages.
-
-        The manager itself is not modified; only local copies of its free
-        counters are decremented while selecting.
-
-        Returns:
-            :return List[Sequence]:
-                The list of :class:`Sequence` objects chosen for prefill in
-                this step. The caller is responsible for marking them as
-                active via :meth:`add_running_sequence_ids`.
-        """
-        total_tok, sequences = 0, []
-        num_free_batches, num_free_pages = (
-            self.manager.num_free_batches,
-            self.manager.num_free_pages,
-        )
-        for seq_id in self.pending_sequence_ids:
-            seq = self.allseq_mapping[seq_id]
-            prompt_length = seq.prompt_len
-            pages_needed = (
-                cdiv(
-                    prompt_length + seq.sampling_params.max_new_tokens,
-                    self.manager.page_size,
-                )
-                * self.manager.num_kv_heads
-            )
-            if (
-                prompt_length + total_tok <= self.manager.max_batched_tokens
-                and num_free_batches > 0
-                and pages_needed <= num_free_pages
-            ):
-                sequences.append(seq)
-                total_tok += prompt_length
-                num_free_pages -= pages_needed
-                num_free_batches -= 1
-        return sequences
-
-    def diagnose_prefill_failure(self) -> str:
-        """Explain why :meth:`get_prefill_batch` may return empty (debugging).
-
-        Reports the manager's free counters and, for one arbitrary pending
-        sequence, its size and the most likely limiting constraint.
-        """
-        num_free_batches = self.manager.num_free_batches
-        num_free_pages = self.manager.num_free_pages
-        parts = [
-            f"num_free_batches={num_free_batches}",
-            f"num_free_pages={num_free_pages}",
-            f"num_pages_per_layer={getattr(self.manager, 'num_pages', None)}",
-        ]
-        seq_id = next(iter(self.pending_sequence_ids), None)
-        if seq_id is None:
-            return "; ".join(parts)
-        seq = self.allseq_mapping[seq_id]
-        pl = seq.prompt_len
-        mn = seq.sampling_params.max_new_tokens
-        pages_needed = (
-            cdiv(pl + mn, self.manager.page_size) * self.manager.num_kv_heads
-        )
-        parts.append(
-            f"first_pending seq_id={seq_id} prompt_len={pl} max_new_tokens={mn} "
-            f"pages_needed~={pages_needed}"
-        )
-        if num_free_batches == 0:
-            parts.append(
-                "likely_cause=no free batch slots (compactor max_num_seqs exhausted)"
-            )
-        elif pl > self.manager.max_batched_tokens:
-            parts.append(
-                f"likely_cause=prompt_len ({pl}) > max_batched_tokens "
-                f"({self.manager.max_batched_tokens})"
-            )
-        elif pages_needed > num_free_pages:
-            parts.append(
-                "likely_cause=KV pool too small: pages_needed exceeds num_free_pages "
-                "(raise VLLM_KVPRUNE_COMPACTOR_KV_FREE_FRAC / lower v1 memory, or cap "
-                "compactor max_num_seqs to shrink page-table overhead)"
-            )
-        else:
-            parts.append(
-                "likely_cause=batched token sum or greedy order (another sequence may "
-                "block first in set iteration)"
-            )
-        return "; ".join(parts)
-
-    def is_finished(self) -> bool:
-        """
-        Check whether all sequences have completed (nothing pending or active).
-        """
-        return not self.pending_sequence_ids and not self.active_sequence_ids
-
-    def any_pending_sequences(self) -> bool:
-        """
-        Check whether any sequences are still pending (not yet started).
-        """
-        return bool(self.pending_sequence_ids)
-
-    def add_running_sequence_ids(
-        self, active_sequence_ids: Iterable[int], *, update_status: bool = False
-    ):
-        """
-        Mark a set of sequences as active / running. This moves sequence IDs
-        from ``pending_sequence_ids`` into ``active_sequence_ids``. Optionally,
-        it also updates the per-sequence status and the input-token counter.
-
-        Args:
-            :param active_sequence_ids:
-                Iterable of sequence IDs that have been scheduled for prefill
-                or decode and should now be considered running. Consumed once.
-            :param update_status:
-                If True, set each corresponding :class:`Sequence`'s
-                ``status = SequenceStatus.RUNNING`` and add its prompt length
-                to ``total_tokens_input``.
-        """
-        # Materialise once: the ids are needed for the set update and the loop.
-        started_ids = list(active_sequence_ids)
-        self.active_sequence_ids.update(started_ids)
-        self.pending_sequence_ids.difference_update(self.active_sequence_ids)
-        if update_status:
-            for seq_id in started_ids:
-                seq = self.allseq_mapping[seq_id]
-                seq.status = SequenceStatus.RUNNING
-                self.total_tokens_input += seq.prompt_len
-
-    def get_finished_sequence_ids_from_unfinished(
-        self, unfinished_sequence_ids: Iterable[int]
-    ) -> set[int]:
-        """
-        Infer which active sequences have finished given the
-        unfinished set (for decode steps where the caller knows
-        which sequences are still generating but not necessarily
-        which have just completed).
-        Args:
-            :param unfinished_sequence_ids:
-                Iterable of sequence IDs that are still running
-        Returns:
-            :return set[int]:
-                The active sequence IDs that are not in
-                ``unfinished_sequence_ids``.
-        """
-        return self.active_sequence_ids.difference(unfinished_sequence_ids)
-
-    def record_finished_sequence_ids(
-        self, finished_sequence_ids: Iterable[int], *, update_status: bool = False
-    ):
-        """
-        Record that a set of sequences has finished generation.
-
-        This moves IDs from ``active_sequence_ids`` into
-        ``finished_sequence_ids``.
-
-        Args:
-            :param finished_sequence_ids:
-                Iterable of sequence IDs that have completed generation and
-                no longer require KV cache. Consumed once.
-            :param update_status:
-                If True, set each corresponding :class:`Sequence`'s
-                ``status = SequenceStatus.FINISHED`` and advance the progress
-                bar (when enabled) by one per sequence.
-        """
-        # Materialise once: a one-shot iterable would otherwise be exhausted
-        # after the first set operation and the ids would be lost.
-        done_ids = list(finished_sequence_ids)
-        self.active_sequence_ids.difference_update(done_ids)
-        self.finished_sequence_ids.update(done_ids)
-        if update_status:
-            for seq_id in done_ids:
-                self.allseq_mapping[seq_id].status = SequenceStatus.FINISHED
-                if self.pbar is not None:
-                    self.pbar.update(1)
-
-    def update_sequences(self, tokens: Iterable[int], seq_ids: Iterable[int]):
-        """
-        Append newly generated tokens to their corresponding sequences.
-        Args:
-            :param tokens:
-                Iterable of generated token IDs.
-            :param seq_ids:
-                Iterable of sequence IDs aligned element-wise with ``tokens``;
-                a sequence ID may appear more than once.
-        """
-        cur_time = time.perf_counter()
-        for tok, seq_id in zip(tokens, seq_ids):
-            self.allseq_mapping[seq_id].add_new_token(tok)
-            self.total_tokens_generated += 1
-        if self.pbar is not None:
-            elapsed = cur_time - self.start_time
-            # Skip the display update rather than divide by zero.
-            if elapsed > 0:
-                self.pbar.set_description(
-                    f"Throughput: {(self.total_tokens_generated + self.total_tokens_input) / elapsed:.2f} tok/s"
-                )
-
-    def close(self):
-        """Close the progress bar, if one was created."""
-        if self.pbar is not None:
-            self.pbar.close()
-
-    def can_prefill_another_batch(self) -> bool:
-        """Return True when :meth:`get_prefill_batch` would select at least one sequence."""
-        return len(self.get_prefill_batch()) > 0
--- a/vllm/kvprune_legacy_save/integration/__init__.py
+++ b/vllm/kvprune_legacy_save/integration/__init__.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""KV-pruning integration: compactor ``LLMEngine`` sharing weights with :class:`~vllm.LLM`."""
-
-from vllm.kvprune.integration.compression_params import CompressionParams
-
-__all__ = ["CompressionParams"]
--- a/vllm/kvprune_legacy_save/integration/compactor_shared.py
+++ b/vllm/kvprune_legacy_save/integration/compactor_shared.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Construct compactor :class:`LLMEngine` sharing weight tensors with an in-process vLLM ``LLM``."""
-
-from __future__ import annotations
-
-import os
-
-import torch.nn as nn
-
-from vllm.config import VllmConfig
-from vllm.kvprune.config.engine_config import LLMConfig
-from vllm.kvprune.core.llm_engine import LLMEngine
-from vllm.kvprune.integration.config_adapter import vllm_config_to_llm_config
-from vllm.kvprune.integration.vllm_model_access import extract_vllm_causal_lm
-from vllm.kvprune.integration.weight_tie import (
-    delegate_kvprune_compute_logits_to_vllm,
-    delegate_kvprune_embed_tokens_to_vllm,
-    tie_kvprune_rope_buffers_from_vllm,
-    tie_kvprune_weights_from_vllm,
-)
-from vllm.kvprune.models import MODEL_REGISTRY
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-def build_llm_config_for_compactor(vc: VllmConfig) -> LLMConfig:
-    """Translate a vLLM engine config into the compactor's :class:`LLMConfig`.
-
-    Thin public alias for :func:`vllm_config_to_llm_config`; the result is returned
-    unchanged.
-    """
-    llm_config = vllm_config_to_llm_config(vc)
-    return llm_config
-
-
-def create_compactor_engine_with_shared_weights(llm: object) -> LLMEngine:
-    """Single GPU, TP=1: compactor ``LLMEngine`` whose weights alias vLLM tensors.
-
-    Call after the vLLM ``LLM`` has loaded weights. Requires in-process executor
-    (``VLLM_ENABLE_V1_MULTIPROCESSING=0``).
-
-    Steps, in order:
-
-    1. Derive an :class:`LLMConfig` from ``llm.llm_engine.vllm_config``.
-    2. Apply environment overrides (``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS``,
-       ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER``, ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH``).
-    3. Instantiate the compactor model from ``MODEL_REGISTRY`` and tie its weights
-       and RoPE buffers to the vLLM model; delegate embedding and logits to vLLM.
-    4. Force ``enforce_eager=True`` unless ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH`` is
-       truthy, then build the engine with ``external_model=kv_model``.
-
-    Args:
-        llm: Object exposing ``llm_engine`` (with a ``vllm_config`` attribute); it is
-            also passed to :func:`extract_vllm_causal_lm`.
-
-    Returns:
-        The constructed compactor :class:`LLMEngine`.
-
-    Raises:
-        RuntimeError: If ``llm`` has no ``llm_engine`` attribute (or it is ``None``).
-        ValueError: If ``tensor_parallel_size != 1``, if the HF ``model_type`` is not
-            in ``MODEL_REGISTRY``, or if ``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS`` is
-            set to a non-integer string (raised by ``int()``).
-        AssertionError: If the derived config carries no ``hf_config``.
-    """
-    llm_engine = getattr(llm, "llm_engine", None)
-    if llm_engine is None:
-        raise RuntimeError("Expected ``llm.llm_engine``.")
-    vc: VllmConfig = llm_engine.vllm_config
-    if vc.parallel_config.tensor_parallel_size != 1:
-        raise ValueError(
-            "Shared-weight compactor backend requires tensor_parallel_size=1"
-        )
-
-    cfg = vllm_config_to_llm_config(vc)
-    # ``cfg.enforce_eager`` is for the compactor ``ModelRunner`` only (decode CUDA
-    # graphs), not v1. v1 graph capture is controlled solely by ``LLM(...,
-    # enforce_eager=...)`` / ``kvprune_compression=True`` on the entrypoint ``LLM``.
-    # Large vLLM max_num_seqs blows up compactor page-table GPU memory; sharing the GPU
-    # with v1 leaves little room for metadata + KV tensors. Default cap 32 so physical
-    # KV pages stay usable; set VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS=0 to disable cap,
-    # or raise (e.g. 128) if you have VRAM headroom.
-    _cap = os.environ.get("VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS", "32").strip()
-    if _cap:
-        # NOTE: a non-integer value raises ValueError here; the cap only ever lowers
-        # ``cfg.max_num_seqs`` (``min``), it never raises it.
-        lim = int(_cap)
-        if lim > 0:
-            cfg.max_num_seqs = min(cfg.max_num_seqs, lim)
-
-    # Compactor decode graphs (``enforce_eager=False``): honored for non-shared-weight
-    # engines. **Shared-weight** path (below) forces ``enforce_eager=True`` after
-    # delegating ``compute_logits`` to vLLM unless ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1``.
-    # Opt out of graphs for non-shared runs: ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1`` or
-    # ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0``.
-    # Precedence: an explicit ENFORCE_EAGER value (truthy or falsy) wins; only when it
-    # is unset/unrecognized is COMPACTOR_CUDA_GRAPH consulted.
-    _ce = os.environ.get("VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER", "").strip().lower()
-    if _ce in ("1", "true", "yes"):
-        cfg.enforce_eager = True
-        logger.info(
-            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 → "
-            "enforce_eager=True (skip compactor decode CUDA graphs)."
-        )
-    elif _ce in ("0", "false", "no"):
-        cfg.enforce_eager = False
-        logger.info(
-            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=0 → "
-            "enforce_eager=False (try compactor CUDA graph capture)."
-        )
-    else:
-        _dg = os.environ.get(
-            "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH", "1"
-        ).strip().lower()
-        if _dg in ("0", "false", "no"):
-            cfg.enforce_eager = True
-            logger.info(
-                "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 → "
-                "enforce_eager=True (skip compactor decode CUDA graphs)."
-            )
-        else:
-            cfg.enforce_eager = False
-            logger.info(
-                "KV-prune compactor: default try decode CUDA graphs; ModelRunner "
-                "falls back to eager if capture yields none. Set "
-                "VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 or "
-                "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 to skip capture."
-            )
-
-    hf = cfg.hf_config
-    assert hf is not None
-    model_type = hf.model_type
-    if model_type not in MODEL_REGISTRY:
-        raise ValueError(
-            f"Compactor MODEL_REGISTRY has no entry for model_type={model_type!r}; "
-            f"supported: {sorted(MODEL_REGISTRY)}"
-        )
-
-    vllm_model = extract_vllm_causal_lm(llm)
-    # Device/dtype are taken from the first parameter of the vLLM model.
-    device = next(vllm_model.parameters()).device
-    dtype = next(vllm_model.parameters()).dtype
-
-    # Build compactor shell on CPU first. **Do not** call ``.to(device)`` before tying:
-    # that allocates a full second copy of weights on GPU; tying then frees the
-    # duplicate but peak memory can OOM on large models. Tie first so parameters
-    # alias vLLM tensors directly (no extra weight VRAM).
-    kv_model: nn.Module = MODEL_REGISTRY[model_type](hf)
-    tie_kvprune_weights_from_vllm(vllm_model, kv_model)
-    # Buffers (e.g. RoPE tables) not in ``named_parameters`` may still be on CPU.
-    kv_model.to(device=device, dtype=dtype)
-    tie_kvprune_rope_buffers_from_vllm(vllm_model, kv_model)
-    delegate_kvprune_embed_tokens_to_vllm(vllm_model, kv_model)
-    delegate_kvprune_compute_logits_to_vllm(vllm_model, kv_model)
-
-    # Compactor decode CUDA graphs capture ``model.forward`` + ``compute_logits`` in one
-    # graph. Here ``compute_logits`` is delegated to vLLM's LM head / LogitsProcessor
-    # (cublas GEMM, padded vocab, etc.). Embedding that in a nested capture commonly
-    # fails with ``CUBLAS_STATUS_EXECUTION_FAILED`` and invalidates stream capture
-    # (``cudaErrorStreamCaptureInvalidated``). Default: skip graphs for this integration.
-    # This override runs last, so it supersedes whatever the ENFORCE_EAGER /
-    # COMPACTOR_CUDA_GRAPH block above decided unless SHARED_WEIGHT_GRAPH is truthy.
-    _sw_graph = os.environ.get(
-        "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH", "0"
-    ).strip().lower() in ("1", "true", "yes")
-    if not _sw_graph:
-        cfg.enforce_eager = True
-        logger.info(
-            "KV-prune shared-weight compactor: enforce_eager=True (skip compactor "
-            "decode CUDA graphs; logits delegated to vLLM). Set "
-            "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1 only to attempt capture (often fails)."
-        )
-
-    return LLMEngine(cfg, external_model=kv_model)
--- a/vllm/kvprune_legacy_save/integration/compressed_generate.py
+++ b/vllm/kvprune_legacy_save/integration/compressed_generate.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""KV-pruning (compactor) path invoked from :meth:`vllm.entrypoints.llm.LLM.generate`."""
-
-from __future__ import annotations
-
-import os
-from collections.abc import Callable, Sequence
-from pathlib import Path
-from typing import Any
-
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer
-
-from vllm.kvprune.compression.compression_config import (
-    BatchCompressionParams,
-    SequenceCompressionParams,
-)
-from vllm.kvprune.config.sampling_params import SamplingParams as CompactorSamplingParams
-from vllm.kvprune.core.compression_bridge import (
-    compression_method_id_to_enum,
-    compression_method_str_to_id,
-)
-from vllm.kvprune.core.llm_engine import LLMEngine, _infer_stop_token_ids
-from vllm.kvprune.integration.compactor_shared import create_compactor_engine_with_shared_weights
-from vllm.kvprune.integration.compression_params import CompressionParams
-from vllm.logger import init_logger
-from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.sampling_params import SamplingParams
-
-logger = init_logger(__name__)
-
-_MP_ENV = "VLLM_ENABLE_V1_MULTIPROCESSING"
-_RELEASE_V1_KV_ENV = "VLLM_KVPRUNE_RELEASE_V1_KV"
-
-
-def _maybe_release_v1_kv_for_compactor(llm: Any) -> None:
-    """Run an optional ``sleep``/``wake_up`` cycle on ``llm`` before compactor use.
-
-    The cycle is opt-in: it only runs when ``VLLM_KVPRUNE_RELEASE_V1_KV`` is
-    ``1``/``true``/``yes`` (case-insensitive, surrounding whitespace ignored). An
-    unset variable is read as ``"0"``, i.e. the function returns without touching
-    ``llm``.
-
-    When enabled it calls ``llm.sleep(level=1, mode="abort")`` followed by
-    ``llm.wake_up()``. Per the log message, the intent is to discard v1's KV cache so
-    the compactor has more GPU memory; weights are reloaded on wake-up.
-
-    Best effort: any exception raised by the sleep/wake calls is logged as a warning
-    and swallowed, so the caller proceeds either way.
-    """
-    flag = os.environ.get(_RELEASE_V1_KV_ENV, "0").strip().lower()
-    release_requested = flag in ("1", "true", "yes")
-    if not release_requested:
-        return
-    try:
-        logger.info(
-            "%s=1: discarding v1 KV via sleep(level=1) then wake_up() "
-            "(reloads model weights to GPU).",
-            _RELEASE_V1_KV_ENV,
-        )
-        llm.sleep(level=1, mode="abort")
-        llm.wake_up()
-    except Exception as exc:
-        logger.warning("%s: sleep/wake failed: %s", _RELEASE_V1_KV_ENV, exc)
-
-
-def ensure_inprocess_engine_for_weight_sharing() -> None:
-    """Force ``VLLM_ENABLE_V1_MULTIPROCESSING=0`` in this process's environment.
-
-    The compactor must see ``worker.get_model()`` in the same process as vLLM. If the
-    variable is already exactly ``"0"`` nothing happens; otherwise (unset counts as
-    ``"1"``) it is overwritten and an info line is logged.
-    """
-    already_inprocess = os.environ.get(_MP_ENV, "1") == "0"
-    if already_inprocess:
-        return
-    os.environ[_MP_ENV] = "0"
-    logger.info(
-        "KV cache pruning: set %s=0 so the model stays in-process for "
-        "shared-weight compactor (no manual env needed).",
-        _MP_ENV,
-    )
-
-
-def _normalize_prompt_list(prompts: Any) -> list[Any]:
-    """Coerce the ``prompts`` argument into a list with one entry per prompt.
-
-    A single prompt is wrapped in a one-element list; a sequence of prompts is
-    copied into a new list. The following are treated as a *single* prompt:
-
-    - ``str`` (text prompt),
-    - ``dict`` (e.g. ``{"prompt": ...}`` / ``{"prompt_token_ids": ...}``),
-    - a non-empty ``list`` made only of ``int`` (token ids for one prompt).
-
-    The last case matters: iterating a bare token-id list would otherwise yield one
-    "prompt" per integer, which ``_prompt_to_compactor_input`` rejects.
-    """
-    if isinstance(prompts, (str, dict)):
-        return [prompts]
-    if (
-        isinstance(prompts, list)
-        and prompts
-        and all(isinstance(t, int) for t in prompts)
-    ):
-        # One decoder-only token-id prompt, not a batch of integer "prompts".
-        return [prompts]
-    return list(prompts)
-
-
-def _normalize_sampling_params(
-    sampling_params: SamplingParams | Sequence[SamplingParams] | None,
-    n: int,
-) -> list[SamplingParams]:
-    """Expand ``sampling_params`` into a list of exactly ``n`` entries.
-
-    - ``None``: ``n`` freshly constructed default :class:`SamplingParams`.
-    - a single :class:`SamplingParams`: the same object repeated ``n`` times.
-    - a sequence: copied to a list, which must have length ``n``.
-
-    Raises:
-        ValueError: If a sequence is given whose length differs from ``n``.
-    """
-    if isinstance(sampling_params, SamplingParams):
-        # The one instance is shared by every slot (no copies are made).
-        return [sampling_params] * n
-    if sampling_params is None:
-        return [SamplingParams() for _ in range(n)]
-    sps = list(sampling_params)
-    if len(sps) == n:
-        return sps
-    raise ValueError(
-        f"sampling_params length {len(sps)} != prompts length {n}"
-    )
-
-
-def _normalize_compression_params(
-    compression: CompressionParams | Sequence[CompressionParams] | None,
-    n: int,
-) -> list[CompressionParams]:
-    """Expand ``compression`` into a list of exactly ``n`` entries.
-
-    - ``None``: ``n`` new ``CompressionParams(compression_ratio=1.0)`` objects.
-    - a single :class:`CompressionParams`: the same object repeated ``n`` times.
-    - a sequence: copied to a list, which must have length ``n``.
-
-    Raises:
-        ValueError: If a sequence is given whose length differs from ``n``.
-    """
-    if isinstance(compression, CompressionParams):
-        # The one instance is shared by every slot (no copies are made).
-        return [compression] * n
-    if compression is None:
-        return [CompressionParams(compression_ratio=1.0) for _ in range(n)]
-    comp = list(compression)
-    if len(comp) == n:
-        return comp
-    raise ValueError(f"compression length {len(comp)} != prompts length {n}")
-
-
-def _any_compactor(comps: list[CompressionParams]) -> bool:
-    """Return ``True`` if at least one entry has ``compression_ratio`` below 1.0."""
-    for params in comps:
-        if params.compression_ratio < 1.0:
-            return True
-    return False
-
-
-_FORCE_COMPACTOR_PATH_ENV = "VLLM_KVPRUNE_FORCE_COMPACTOR_PATH"
-
-
-def _should_use_kvprune_compactor_path(comps: list[CompressionParams]) -> bool:
-    """Decide whether the batch should run on the integrated compactor.
-
-    Returns ``True`` when any entry requests compression (see ``_any_compactor``).
-    Otherwise returns ``True`` only if ``VLLM_KVPRUNE_FORCE_COMPACTOR_PATH`` is
-    ``1``/``true``/``yes`` (case-insensitive, whitespace ignored), which lets the
-    compactor path be exercised with no KV pruning, e.g. to debug it without
-    falling back to the standard v1 engine.
-    """
-    return _any_compactor(comps) or os.environ.get(
-        _FORCE_COMPACTOR_PATH_ENV, ""
-    ).strip().lower() in ("1", "true", "yes")
-
-
-def _to_compactor_sampling(sp: SamplingParams) -> CompactorSamplingParams:
-    """Convert vLLM sampling params into the compactor's sampling params.
-
-    Only ``temperature`` and ``max_tokens`` are carried over; a ``max_tokens`` of
-    ``None`` becomes ``max_new_tokens=16``.
-    """
-    max_new = 16 if sp.max_tokens is None else sp.max_tokens
-    return CompactorSamplingParams(
-        temperature=float(sp.temperature),
-        max_new_tokens=int(max_new),
-    )
-
-
-def _to_sequence_compression(cp: CompressionParams) -> SequenceCompressionParams:
-    """Copy the per-prompt fields of ``cp`` into a :class:`SequenceCompressionParams`.
-
-    ``compression_ratio`` is coerced to ``float`` and both ``protected_*`` counts to
-    ``int``; ``cp.compression_method`` is not forwarded here.
-    """
-    ratio = float(cp.compression_ratio)
-    keep_head = int(cp.protected_first_tokens)
-    keep_tail = int(cp.protected_last_tokens)
-    return SequenceCompressionParams(
-        compression_ratio=ratio,
-        protected_first_tokens=keep_head,
-        protected_last_tokens=keep_tail,
-    )
-
-
-def _batch_compression_from_comps(comps: list[CompressionParams]) -> BatchCompressionParams:
-    """Pick the batch-level compression method from the per-prompt params.
-
-    The method of the *first* entry with ``compression_ratio < 1.0`` is used for the
-    whole batch; later entries' methods are not consulted. With no such entry a
-    default-constructed :class:`BatchCompressionParams` is returned.
-    """
-    first_compressed = next(
-        (c for c in comps if c.compression_ratio < 1.0),
-        None,
-    )
-    if first_compressed is None:
-        return BatchCompressionParams()
-    method_id = compression_method_str_to_id(first_compressed.compression_method)
-    return BatchCompressionParams(
-        compression_method=compression_method_id_to_enum(method_id)
-    )
-
-
-def _kvprune_compactor_hf_tokenizer(llm: Any):
-    """Return (and memoize on ``llm``) a fast Hugging Face tokenizer for the model.
-
-    The tokenizer is cached in ``llm._kvprune_compactor_hf_tokenizer``; later calls
-    return that object. On first use the source is resolved as follows:
-
-    - if ``model_config.model`` is a directory containing ``config.json``, its
-      resolved path is used;
-    - otherwise ``huggingface_hub.snapshot_download`` is asked for a snapshot path;
-    - if either step raises, the raw model string is used as-is.
-
-    ``trust_remote_code`` is read from ``hf_config`` (``False`` when the attribute or
-    the config itself is missing).
-    """
-    tok = getattr(llm, "_kvprune_compactor_hf_tokenizer", None)
-    if tok is not None:
-        return tok
-
-    model_config = llm.llm_engine.vllm_config.model_config
-    model_s = str(model_config.model)
-    try:
-        candidate = Path(model_s)
-        if candidate.is_dir() and (candidate / "config.json").is_file():
-            src = str(candidate.resolve())
-        else:
-            from huggingface_hub import snapshot_download
-
-            src = snapshot_download(repo_id=model_s, local_files_only=False)
-    except Exception:
-        # Best effort: let ``from_pretrained`` resolve the raw identifier itself.
-        src = model_s
-
-    hf_cfg = model_config.hf_config
-    if hf_cfg is None:
-        _trust = False
-    else:
-        _trust = bool(getattr(hf_cfg, "trust_remote_code", False))
-    tok = AutoTokenizer.from_pretrained(src, use_fast=True, trust_remote_code=_trust)
-    llm._kvprune_compactor_hf_tokenizer = tok
-    return tok
-
-
-def _prompt_to_compactor_input(prompt: Any) -> str | list[int]:
-    """Convert one vLLM-style prompt into what the compactor engine accepts.
-
-    Accepted forms:
-
-    - ``str``: returned unchanged.
-    - non-empty ``list`` of ``int`` token ids: returned as a new list.
-    - ``dict`` with ``"prompt_token_ids"``: the ids, converted to a ``list`` if they
-      are not one already (takes precedence over ``"prompt"``).
-    - ``dict`` with a ``str`` under ``"prompt"``: that string.
-
-    Raises:
-        TypeError: For any other prompt shape, and for an empty token-id prompt
-            (both the bare-list and the ``"prompt_token_ids"`` form, so the two
-            spellings of the same input are treated alike).
-    """
-    if isinstance(prompt, str):
-        return prompt
-    # Decoder-only `list[int]` token ids (see `vllm.inputs.PromptType`).
-    if isinstance(prompt, list):
-        if not prompt:
-            raise TypeError("Empty token-id prompt is not supported for compactor path.")
-        if all(isinstance(t, int) for t in prompt):
-            return list(prompt)
-    if isinstance(prompt, dict):
-        if "prompt_token_ids" in prompt:
-            ids = prompt["prompt_token_ids"]
-            if not isinstance(ids, list):
-                ids = list(ids)
-            if not ids:
-                raise TypeError(
-                    "Empty token-id prompt is not supported for compactor path."
-                )
-            return ids
-        p = prompt.get("prompt")
-        if isinstance(p, str):
-            return p
-    raise TypeError(
-        f"Unsupported prompt type for compactor path: {type(prompt)}. "
-        "Use str, list[int] token ids, or dict with 'prompt_token_ids' or 'prompt'."
-    )
-
-
-def _prompt_to_token_ids_for_tp(llm: Any, prompt: Any) -> list[int]:
-    """Return token ids for ``prompt`` on the driver, for the TP collective path.
-
-    Text prompts are encoded with ``llm.get_tokenizer()``; prompts that already
-    carry token ids are returned as a new list.
-    """
-    normalized = _prompt_to_compactor_input(prompt)
-    if not isinstance(normalized, str):
-        return list(normalized)
-    return llm.get_tokenizer().encode(normalized)
-
-
-def _compressed_generate_tp_collective(
-    llm: Any,
-    plist: list[Any],
-    sps: list[SamplingParams],
-    comps: list[CompressionParams],
-) -> list[RequestOutput]:
-    """TP>1: run compactor on each worker via ``collective_rpc`` (all ranks).
-
-    Prompts are tokenized on the driver, bundled with per-prompt sampling and
-    compression settings into a plain-``dict`` payload, and sent to every worker
-    through ``collective_rpc("kvprune_v1_compressed_generate", ...)``. The result
-    reported by tensor-parallel rank 0 is converted to :class:`RequestOutput` objects.
-
-    Args:
-        llm: The vLLM ``LLM``-like object (needs ``llm_engine`` and ``get_tokenizer``).
-        plist: Normalized prompts, one entry per request.
-        sps: Sampling params, parallel to ``plist``.
-        comps: Compression params, parallel to ``plist``.
-
-    Raises:
-        NotImplementedError: If pipeline- or data-parallel size is not 1.
-        ValueError: If any prompt has more tokens than ``max_model_len``.
-        RuntimeError: If the RPC was cancelled, or no result dict from tensor-parallel
-            rank 0 was returned. Other ``RuntimeError``s from the RPC propagate as-is.
-    """
-    vc = llm.llm_engine.vllm_config
-    pc = vc.parallel_config
-    if pc.pipeline_parallel_size != 1 or pc.data_parallel_size != 1:
-        raise NotImplementedError(
-            "KV-prune TP compression requires pipeline_parallel_size=1 and "
-            f"data_parallel_size=1 (got PP={pc.pipeline_parallel_size}, "
-            f"DP={pc.data_parallel_size})."
-        )
-
-    hf = vc.model_config.hf_config
-    tok = llm.get_tokenizer()
-    eos_token_ids = _infer_stop_token_ids(tok, hf)
-
-    prompt_token_ids = [_prompt_to_token_ids_for_tp(llm, p) for p in plist]
-
-    # Fail fast on the driver before any worker is involved.
-    max_len = int(vc.model_config.max_model_len)
-    for i, ids in enumerate(prompt_token_ids):
-        if len(ids) > max_len:
-            raise ValueError(
-                f"KV-prune TP compressed generate: prompt {i} length {len(ids)} "
-                f"exceeds max_model_len ({max_len}). Shorten the prompt or raise "
-                "max_model_len when constructing LLM()."
-            )
-
-    # Payload must be picklable for multiproc/Ray RPC: do not pass multiprocessing
-    # synchronization primitives (workers are separate processes).
-    # Only builtin types are used below; ``max_tokens=None`` becomes 16 new tokens,
-    # the same default as ``_to_compactor_sampling``.
-    payload: dict[str, Any] = {
-        "eos_token_ids": eos_token_ids,
-        "prompt_token_ids": prompt_token_ids,
-        "sampling_params": [
-            {
-                "temperature": float(sp.temperature),
-                "max_new_tokens": int(sp.max_tokens if sp.max_tokens is not None else 16),
-            }
-            for sp in sps
-        ],
-        "compression_params": [
-            {
-                "compression_ratio": float(c.compression_ratio),
-                "compression_method": str(c.compression_method),
-                "protected_first_tokens": int(c.protected_first_tokens),
-                "protected_last_tokens": int(c.protected_last_tokens),
-            }
-            for c in comps
-        ],
-    }
-
-    # Unlike the single-GPU path, this runs on every call (it is a no-op unless the
-    # release env var is enabled).
-    _maybe_release_v1_kv_for_compactor(llm)
-    try:
-        results = llm.llm_engine.collective_rpc(
-            "kvprune_v1_compressed_generate",
-            args=(payload,),
-        )
-    except RuntimeError as e:
-        # Re-word only the "cancelled" case to point at the likely root cause.
-        if "cancelled" in str(e).lower():
-            raise RuntimeError(
-                "collective_rpc was cancelled (a GPU worker likely crashed). "
-                "Scroll up for the first worker traceback — often NCCL/CUDA before "
-                "TCPStore/Broken pipe on the driver."
-            ) from e
-        raise
-    # Use the first result dict that identifies itself as tensor-parallel rank 0.
-    master: dict[str, Any] | None = None
-    for r in results:
-        if isinstance(r, dict) and r.get("tensor_parallel_rank") == 0:
-            master = r
-            break
-    if master is None:
-        raise RuntimeError(
-            "collective_rpc did not return a dict from tensor parallel rank 0."
-        )
-    return _tp_payload_to_request_outputs(llm, master)
-
-
-def _tp_payload_to_request_outputs(llm: Any, master: dict[str, Any]) -> list[RequestOutput]:
-    """Convert the rank-0 RPC result into one finished :class:`RequestOutput` per prompt.
-
-    ``master`` must contain parallel lists under ``"prompt_token_ids"`` and
-    ``"completion_token_ids"``. Completions are decoded with ``llm.get_tokenizer()``.
-
-    Every output is reported with ``finish_reason="stop"``, ``finished=True``, no
-    logprobs and ``prompt=None``; request ids are ``kvprune-tp-<index>``.
-    """
-    tok = llm.get_tokenizer()
-    out: list[RequestOutput] = []
-    pids_list = master["prompt_token_ids"]
-    cids_list = master["completion_token_ids"]
-    for i, (pids, cids) in enumerate(zip(pids_list, cids_list)):
-        text = tok.decode(cids, skip_special_tokens=True)
-        # Same fallback as the single-GPU ``_sequences_to_request_outputs``: if every
-        # emitted id is "special" (e.g. EOS / chat boundary), the stripped string is
-        # empty although tokens were generated. Decode again keeping special tokens
-        # so the caller does not get a blank answer.
-        if not text.strip() and cids:
-            text = tok.decode(cids, skip_special_tokens=False)
-        co = CompletionOutput(
-            index=0,
-            text=text,
-            token_ids=list(cids),
-            cumulative_logprob=None,
-            logprobs=None,
-            finish_reason="stop",
-        )
-        ro = RequestOutput(
-            request_id=f"kvprune-tp-{i}",
-            prompt=None,
-            prompt_token_ids=list(pids),
-            prompt_logprobs=None,
-            outputs=[co],
-            finished=True,
-        )
-        out.append(ro)
-    return out
-
-
-def _ensure_compactor_engine(llm: Any) -> LLMEngine:
-    """Return the compactor engine cached on ``llm``, creating it on first use.
-
-    The engine is stored in ``llm._kvprune_compactor_engine``.
-
-    Raises:
-        ValueError: If the engine has to be created and ``tensor_parallel_size`` is
-            not 1 (weights can only be shared in that configuration).
-    """
-    existing = llm._kvprune_compactor_engine
-    if existing is not None:
-        return existing
-
-    tp_size = llm.llm_engine.vllm_config.parallel_config.tensor_parallel_size
-    if tp_size != 1:
-        raise ValueError(
-            "KV-pruning compactor path requires tensor_parallel_size=1 "
-            "for shared weights."
-        )
-    llm._kvprune_compactor_engine = create_compactor_engine_with_shared_weights(llm)
-    logger.info("Initialized compactor LLMEngine with weights shared from vLLM.")
-    return llm._kvprune_compactor_engine
-
-
-def try_compressed_generate(
-    llm: Any,
-    prompts: Any,
-    sampling_params: SamplingParams | Sequence[SamplingParams] | None,
-    *,
-    compression: CompressionParams | Sequence[CompressionParams] | None,
-    use_tqdm: bool | Callable[..., tqdm] = True,
-    lora_request: Any = None,
-    priority: list[int] | None = None,
-    tokenization_kwargs: dict[str, Any] | None = None,
-) -> list[RequestOutput] | None:
-    """Return completions on the compactor engine, or ``None`` to use normal v1.
-
-    ``lora_request`` / ``priority`` / ``tokenization_kwargs`` are accepted for API
-    parity with :meth:`~vllm.entrypoints.llm.LLM.generate` but are not passed to the
-    compactor engine yet. ``use_tqdm`` is likewise accepted and ignored.
-
-    Routing:
-
-    - ``tensor_parallel_size > 1``: always handled here through the collective RPC
-      path (never returns ``None``). When no compression is requested and the path is
-      not forced, every prompt is given ``compression_ratio=1.0``.
-    - ``tensor_parallel_size == 1``: returns ``None`` unless some prompt requests
-      compression or ``VLLM_KVPRUNE_FORCE_COMPACTOR_PATH`` is enabled; otherwise runs
-      on the shared-weight compactor engine.
-
-    Args:
-        llm: The vLLM ``LLM``-like object.
-        prompts: One prompt or a sequence of prompts.
-        sampling_params: ``None``, one object for all prompts, or one per prompt.
-        compression: ``None``, one object for all prompts, or one per prompt.
-
-    Returns:
-        One :class:`RequestOutput` per prompt, or ``None`` when the caller should fall
-        back to the standard v1 engine.
-
-    Raises:
-        ValueError: If a per-prompt sequence does not match the number of prompts.
-    """
-    del lora_request, priority, tokenization_kwargs, use_tqdm
-
-    plist = _normalize_prompt_list(prompts)
-    sps = _normalize_sampling_params(sampling_params, len(plist))
-    comps = _normalize_compression_params(compression, len(plist))
-
-    pc = llm.llm_engine.vllm_config.parallel_config
-    # TP>1: every worker must run the same collective_rpc session. If all
-    # compression_ratio >= 1, the old code returned None and only the driver ran
-    # v1 _run_engine — other ranks never joined a matching collective, which can
-    # deadlock NCCL / leave workers unsynchronized (hang at "Processed prompts:").
-    if pc.tensor_parallel_size > 1:
-        if not _should_use_kvprune_compactor_path(comps):
-            comps = [CompressionParams(compression_ratio=1.0) for _ in plist]
-    elif not _should_use_kvprune_compactor_path(comps):
-        return None
-
-    # Advisory only: warn when v1 was not configured with ``enforce_eager``.
-    v1_eager = bool(
-        getattr(llm.llm_engine.vllm_config.model_config, "enforce_eager", False)
-    )
-    if not v1_eager:
-        logger.warning(
-            "KV-prune compression: v1 CUDA graphs are still enabled on this LLM. "
-            "The compactor does not reuse v1 graphs; capture wastes VRAM. "
-            "Set kvprune_compression=True, enforce_eager=True, or "
-            "VLLM_KVPRUNE_COMPRESSION_DEFAULT=1 before import vllm."
-        )
-
-    if pc.tensor_parallel_size > 1:
-        return _compressed_generate_tp_collective(llm, plist, sps, comps)
-
-    # Single-GPU path: the compactor engine lives in this process and is built once.
-    ensure_inprocess_engine_for_weight_sharing()
-    if llm._kvprune_compactor_engine is None:
-        # Only attempted before the engine is first created.
-        _maybe_release_v1_kv_for_compactor(llm)
-    engine = _ensure_compactor_engine(llm)
-    comp_sp = [_to_compactor_sampling(sp) for sp in sps]
-    seq_c = [_to_sequence_compression(c) for c in comps]
-    batch_c = _batch_compression_from_comps(comps)
-    comp_in = [_prompt_to_compactor_input(p) for p in plist]
-
-    # First element of the returned pair is unused here.
-    _, seqs = engine.generate(
-        comp_in,
-        sampling_params=comp_sp,
-        batch_compression_params=batch_c,
-        per_sequence_compression_params=seq_c,
-        return_sequences=True,
-    )
-
-    return _sequences_to_request_outputs(seqs, engine)
-
-
-def _sequences_to_request_outputs(seqs: list[Any], engine: LLMEngine) -> list[RequestOutput]:
-    """Wrap finished compactor sequences as vLLM :class:`RequestOutput` objects.
-
-    Completions are decoded with ``engine.tokenizer``. Every output is reported with
-    ``finish_reason="stop"``, ``finished=True``, no logprobs and ``prompt=None``;
-    request ids are ``kvprune-<index>``.
-    """
-    tokenizer = engine.tokenizer
-
-    def _decode(seq: Any) -> str:
-        decoded = tokenizer.decode(seq.completion_token_ids, skip_special_tokens=True)
-        # When only special tokens were produced (e.g. EOS / chat boundary) the
-        # stripped text is blank even though tokens exist; decode again keeping
-        # special tokens so the answer is not presented as empty.
-        if not decoded.strip() and seq.completion_token_ids:
-            decoded = tokenizer.decode(
-                seq.completion_token_ids, skip_special_tokens=False
-            )
-        return decoded
-
-    results: list[RequestOutput] = []
-    for idx, seq in enumerate(seqs):
-        completion = CompletionOutput(
-            index=0,
-            text=_decode(seq),
-            token_ids=list(seq.completion_token_ids),
-            cumulative_logprob=None,
-            logprobs=None,
-            finish_reason="stop",
-        )
-        results.append(
-            RequestOutput(
-                request_id=f"kvprune-{idx}",
-                prompt=None,
-                prompt_token_ids=list(seq.prompt_token_ids),
-                prompt_logprobs=None,
-                outputs=[completion],
-                finished=True,
-            )
-        )
-    return results
--- a/vllm/kvprune_legacy_save/integration/compression_params.py
+++ b/vllm/kvprune_legacy_save/integration/compression_params.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Per-request KV compression for :meth:`vllm.LLM.generate` (``compression=`` kwarg)."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-
-@dataclass
-class CompressionParams:
-    """Per-prompt compression intent for :meth:`vllm.LLM.generate`.
-
-    If **any** prompt in the batch has ``compression_ratio < 1.0``, the **whole** batch
-    is run on the compactor ``LLMEngine`` (same stack as standalone compactor-vllm:
-    ``PagedKVCache`` + pruning kernels). If all prompts have ``compression_ratio >= 1.0``,
-    the batch stays on standard vLLM.
-
-    ``compression_method`` follows :mod:`vllm.kvprune.core.compression_bridge` aliases:
-    ``none``, ``criticaladakv``, ``compactor``, ``snapkv`` (ignored when
-    ``compression_ratio`` is effectively 1).
-
-    ``protected_*`` map to compactor :class:`~vllm.kvprune.compression.compression_config.SequenceCompressionParams`
-    (defaults match standalone compactor-vllm-style usage).
-
-    Normalization performed on construction:
-
-    - ``compression_method`` is stripped and lower-cased; an empty/``None`` value
-      becomes ``"compactor"``.
-    - A ``compression_ratio`` within ``1e-9`` of 1.0 is snapped to exactly ``1.0`` and
-      the method is set to ``"none"``, so ``compression_ratio < 1.0`` checks elsewhere
-      agree with the method.
-
-    Raises:
-        ValueError: If ``compression_ratio`` is outside ``(0, 1]``, if the method is
-            not a known alias, or if the method is ``"none"`` while the ratio requests
-            compression.
-    """
-
-    compression_ratio: float = 1.0
-    compression_method: str = "compactor"
-    protected_first_tokens: int = 16
-    protected_last_tokens: int = 64
-
-    def __post_init__(self) -> None:
-        """Validate the fields and normalize method / near-1 ratio (see class doc)."""
-        if not 0.0 < self.compression_ratio <= 1.0:
-            raise ValueError(
-                f"compression_ratio must be in (0, 1], got {self.compression_ratio}"
-            )
-        self.compression_method = (
-            self.compression_method or "compactor"
-        ).strip().lower()
-        # Imported lazily, as in the original; presumably to avoid an import cycle
-        # or import-time cost — confirm before hoisting.
-        from vllm.kvprune.core.compression_bridge import VALID_ALIASES_FOR_SAMPLING
-
-        if self.compression_method not in VALID_ALIASES_FOR_SAMPLING:
-            raise ValueError(
-                f"compression_method must be one of {sorted(VALID_ALIASES_FOR_SAMPLING)}, "
-                f"got {self.compression_method!r}"
-            )
-        if self.compression_ratio >= 1.0 - 1e-9:
-            # "Effectively 1": make the ratio exactly 1.0 as well, otherwise a value
-            # such as 0.9999999999 would carry method "none" yet still satisfy the
-            # ``compression_ratio < 1.0`` tests used to route the batch.
-            self.compression_ratio = 1.0
-            self.compression_method = "none"
-        elif self.compression_method == "none":
-            raise ValueError(
-                "When compression_ratio < 1.0, compression_method cannot be 'none'."
-            )
--- a/vllm/kvprune_legacy_save/integration/config_adapter.py
+++ b/vllm/kvprune_legacy_save/integration/config_adapter.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Build :class:`vllm.kvprune.config.engine_config.LLMConfig` from :class:`VllmConfig`."""
-
-from __future__ import annotations
-
-import os
-from pathlib import Path
-
-from vllm.config import VllmConfig
-from vllm.kvprune.config.engine_config import LLMConfig, KvpruneAttentionSchedule
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-def _attention_schedule_from_env() -> KvpruneAttentionSchedule:
-    """Resolve :class:`KvpruneAttentionSchedule` from env.
-
-    Primary (``VLLM_KVPRUNE_ATTENTION_SCHEDULE``, case-insensitive, whitespace
-    stripped). When set to a non-empty value it always decides the result:
-
-    - ``fa_triton`` — FA prefill, Triton decode. Aliases: ``fa_prefill``, ``default``.
-    - ``pdtriton`` — Triton prefill + Triton decode. Aliases: ``triton``,
-      ``triton_prefill``, ``compactor_prefill``, ``pd_triton``.
-    - ``pdfa`` — FA prefill + FA decode (KV stores still Triton). Aliases:
-      ``fa_full``, ``fa_both``.
-    - anything else — warning, then ``fa_triton``.
-
-    Legacy (``VLLM_KVPRUNE_ATTENTION_BACKEND``), consulted only when the primary
-    variable is unset or empty: ``flash``/``fa``/``flash_attention``/``flashattention``
-    → ``fa_triton``; ``compactor``/``triton``/``compactor_triton`` → ``pdtriton``;
-    anything else — warning, then ``fa_triton``.
-
-    With neither variable set the result is ``fa_triton`` (FA prefill, Triton decode).
-    """
-    s = os.environ.get("VLLM_KVPRUNE_ATTENTION_SCHEDULE", "").strip().lower()
-    if s:
-        if s in ("fa_triton", "fa_prefill", "default"):
-            return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-        if s in ("pdtriton", "pd_triton", "triton", "triton_prefill", "compactor_prefill"):
-            return KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
-        if s in ("pdfa", "fa_full", "fa_both"):
-            return KvpruneAttentionSchedule.PDFA
-        logger.warning(
-            "Unknown VLLM_KVPRUNE_ATTENTION_SCHEDULE=%r; using FA_PREFILL_TRITON_DECODE",
-            s,
-        )
-        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-
-    # Primary variable unset/empty: fall back to the legacy variable. Previously an
-    # empty primary value returned before this point, so the legacy mapping could
-    # never take effect.
-    v = os.environ.get("VLLM_KVPRUNE_ATTENTION_BACKEND", "").strip().lower()
-    if not v:
-        # Nothing configured at all: keep the established default.
-        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-    if v in ("flash", "fa", "flash_attention", "flashattention"):
-        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-    if v in ("compactor", "triton", "compactor_triton"):
-        return KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
-    logger.warning(
-        "Unknown VLLM_KVPRUNE_ATTENTION_BACKEND=%r; using FA_PREFILL_TRITON_DECODE", v
-    )
-    return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
-
-
-def _compactor_kvcache_page_size(vllm_block_size: int | None) -> int:
-    """Choose the compactor KV page size (tokens per page) from vLLM's ``block_size``.
-
-    The compactor decode kernel needs a page size that is a multiple of 32 (see
-    ``kvprune/attention/sparse_decode_kernel.py``), while vLLM often uses 16.
-
-    - ``None`` or a non-positive value: 128 (the standalone compactor default).
-    - otherwise: ``block_size`` rounded up to the next multiple of 32.
-    """
-    default_page = 128
-    if vllm_block_size is None:
-        return default_page
-    bs = int(vllm_block_size)
-    if bs <= 0:
-        return default_page
-    remainder = bs % 32
-    return bs if remainder == 0 else bs + (32 - remainder)
-
-
-def vllm_config_to_llm_config(vc: VllmConfig) -> LLMConfig:
-    """Map vLLM engine config to compactor :class:`LLMConfig`.
-
-    Taken from ``vc``: model name, ``max_model_len``, ``gpu_memory_utilization``,
-    ``tensor_parallel_size``, ``hf_config``, ``max_num_seqs`` (256 if the scheduler
-    config has no such attribute) and the cache ``block_size`` (16 if ``None``),
-    which is rounded up to a multiple of 32 for ``kvcache_page_size``.
-
-    Fixed here regardless of ``vc``: ``nccl_port=1218``, ``enforce_eager=False``,
-    ``eos=-1``, ``eos_token_ids=None``, ``leverage_sketch_size=48``,
-    ``attention_backend=None``. ``attention_schedule`` comes from the environment.
-
-    ``path`` is set only when the model string is a local directory containing
-    ``config.json``; otherwise it is ``None``.
-    """
-    mc = vc.model_config
-    cc = vc.cache_config
-    pc = vc.parallel_config
-    sc = vc.scheduler_config
-    block_size = cc.block_size
-    if block_size is None:
-        # With 16 the page-size helper below yields 32 (not its own None-default).
-        block_size = 16
-    max_num_seqs = getattr(sc, "max_num_seqs", 256)
-    # Do **not** forward ``model_config.enforce_eager`` (v1) into compactor
-    # :class:`LLMConfig`. They are independent flags: v1 uses it only to skip
-    # *v1* ``capture_model()``; kvprune :class:`~vllm.kvprune.core.model_runner.ModelRunner`
-    # uses :attr:`LLMConfig.enforce_eager` only for *compactor* decode CUDA graphs.
-    # Shared-weight setup in ``compactor_shared`` defaults compactor to eager decode;
-    # see ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH`` (default try graphs) /
-    # ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER``.
-    # Local checkpoint directory: forward so compactor skips redundant Hub fetches.
-    _model_s = str(mc.model)
-    _path: str | None = None
-    try:
-        if _model_s and Path(_model_s).is_dir() and (Path(_model_s) / "config.json").is_file():
-            _path = str(Path(_model_s).resolve())
-    except OSError:
-        # Filesystem probing is best effort; leave ``path`` unset on failure.
-        pass
-
-    return LLMConfig(
-        model=_model_s,
-        path=_path,
-        nccl_port=1218,
-        max_num_seqs=max_num_seqs,
-        max_model_len=mc.max_model_len,
-        gpu_memory_utilization=cc.gpu_memory_utilization,
-        tensor_parallel_size=pc.tensor_parallel_size,
-        enforce_eager=False,
-        hf_config=mc.hf_config,
-        eos=-1,
-        eos_token_ids=None,
-        kvcache_page_size=_compactor_kvcache_page_size(block_size),
-        leverage_sketch_size=48,
-        attention_schedule=_attention_schedule_from_env(),
-        attention_backend=None,
-    )
--- a/vllm/kvprune_legacy_save/integration/v1_tp_runner.py
+++ b/vllm/kvprune_legacy_save/integration/v1_tp_runner.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""TP>1: one kvprune :class:`~vllm.kvprune.core.model_runner.ModelRunner` per vLLM worker.
-
-Invoked via v1 ``collective_rpc("kvprune_v1_compressed_generate", ...)`` so every tensor-
-parallel rank participates in the same compactor forward/broadcast sequence as the
-standalone multi-process compactor.
-
-Compactor decode CUDA graphs (when not ``enforce_eager``) capture the full decode step
-including ``compute_logits``. To force eager on embedded TP workers, set
-``VLLM_KVPRUNE_TP_EMBEDDED_GRAPH=0`` or ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1``.
-
-Peer/master session boundaries use TP-group ``broadcast``/``barrier`` (see
-``ModelRunner.maybe_release_peers``), not ``multiprocessing.Event`` — RPC payloads must
-be picklable across worker processes.
-"""
-
-from __future__ import annotations
-
-import os
-from typing import Any
-
-import torch
-import torch.nn as nn
-
-from vllm.kvprune.compression.compression_config import (
-    BatchCompressionParams,
-    SequenceCompressionParams,
-)
-from vllm.kvprune.config.sampling_params import SamplingParams as CompactorSamplingParams
-from vllm.kvprune.core.compression_bridge import (
-    compression_method_id_to_enum,
-    compression_method_str_to_id,
-)
-from vllm.kvprune.core.model_runner import ModelRunner
-from vllm.kvprune.integration.config_adapter import vllm_config_to_llm_config
-from vllm.kvprune.utils.kv_dist import barrier_sync
-from vllm.kvprune.integration.weight_tie import (
-    delegate_kvprune_compute_logits_to_vllm,
-    delegate_kvprune_embed_tokens_to_vllm,
-    tie_kvprune_rope_buffers_from_vllm,
-    tie_kvprune_weights_from_vllm,
-)
-from vllm.kvprune.models import MODEL_REGISTRY
-from vllm.kvprune.utils.sequence import Sequence
-
-# Attribute name under which the embedded kvprune ModelRunner is cached on the
-# vLLM worker object (read and written by ``_get_or_create_runner``).
-_ATTR = "_kvprune_tp_embedded_runner"
-
-
-def _apply_compactor_env_overrides(cfg: Any) -> None:
-    """Apply compactor environment-variable overrides to ``cfg`` in place.
-
-    Intended to match the caps applied by
-    :func:`~vllm.kvprune.integration.compactor_shared.create_compactor_engine_with_shared_weights`.
-
-    Environment variables read:
-
-    * ``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS`` (default ``"32"``): when it parses
-      to a positive integer, ``cfg.max_num_seqs`` is lowered to at most that
-      value. An empty value or a non-positive integer leaves it unchanged.
-    * ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER``: ``1/true/yes`` sets
-      ``cfg.enforce_eager = True``; ``0/false/no`` sets it to ``False``.
-    * ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH`` (default ``"1"``): consulted only
-      when the previous variable holds none of the recognised values;
-      ``0/false/no`` turns eager mode on, anything else turns it off.
-
-    Args:
-        cfg: Config object exposing ``max_num_seqs`` and ``enforce_eager``
-            attributes; both may be overwritten.
-
-    Raises:
-        ValueError: If ``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS`` is set to a
-            non-integer value.
-    """
-    truthy = ("1", "true", "yes")
-    falsy = ("0", "false", "no")
-
-    raw_cap = os.environ.get("VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS", "32").strip()
-    if raw_cap:
-        try:
-            limit = int(raw_cap)
-        except ValueError as err:
-            # Name the offending variable; the bare int() message does not.
-            raise ValueError(
-                "VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS must be an integer, "
-                f"got {raw_cap!r}"
-            ) from err
-        if limit > 0:
-            # Only ever tighten the cap; never raise it above the configured value.
-            cfg.max_num_seqs = min(cfg.max_num_seqs, limit)
-
-    eager_flag = (
-        os.environ.get("VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER", "").strip().lower()
-    )
-    if eager_flag in truthy:
-        cfg.enforce_eager = True
-    elif eager_flag in falsy:
-        cfg.enforce_eager = False
-    else:
-        # No explicit eager request: eager mode is the inverse of the
-        # CUDA-graph switch, which defaults to enabled.
-        graph_flag = (
-            os.environ.get("VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH", "1").strip().lower()
-        )
-        cfg.enforce_eager = graph_flag in falsy
-
-
-def _build_sequences(payload: dict[str, Any]) -> list[Sequence]:
-    """Build one kvprune ``Sequence`` per prompt in an RPC payload.
-
-    Reads ``payload["prompt_token_ids"]``, ``payload["sampling_params"]`` and
-    ``payload["compression_params"]`` and pairs their entries by index.
-
-    For each prompt, ``temperature`` and ``max_new_tokens`` are taken from the
-    sampling entry, and ``compression_ratio`` plus the optional
-    ``protected_first_tokens`` (default 16) and ``protected_last_tokens``
-    (default 64) from the compression entry. When the protected head and tail
-    together are at least as long as the prompt, ``compression_ratio`` is
-    forced to ``1.0``.
-
-    Returns:
-        The sequences, in prompt order.
-    """
-    prompts: list[list[int]] = payload["prompt_token_ids"]
-    sampling_rows: list[dict[str, Any]] = payload["sampling_params"]
-    compression_rows: list[dict[str, Any]] = payload["compression_params"]
-
-    built: list[Sequence] = []
-    for idx, token_ids in enumerate(prompts):
-        sampling_row = sampling_rows[idx]
-        sampling = CompactorSamplingParams(
-            temperature=float(sampling_row["temperature"]),
-            max_new_tokens=int(sampling_row["max_new_tokens"]),
-        )
-
-        compression_row = compression_rows[idx]
-        compression = SequenceCompressionParams(
-            compression_ratio=float(compression_row["compression_ratio"]),
-            protected_first_tokens=int(
-                compression_row.get("protected_first_tokens", 16)
-            ),
-            protected_last_tokens=int(
-                compression_row.get("protected_last_tokens", 64)
-            ),
-        )
-        protected_total = (
-            compression.protected_first_tokens + compression.protected_last_tokens
-        )
-        # The protected head and tail already cover the whole prompt.
-        if len(token_ids) <= protected_total:
-            compression.compression_ratio = 1.0
-
-        built.append(
-            Sequence(
-                prompt_token_ids=list(token_ids),
-                sampling_params=sampling,
-                compression_params=compression,
-            )
-        )
-    return built
-
-
-def _batch_compression_from_payload(payload: dict[str, Any]) -> BatchCompressionParams:
-    """Derive batch-level compression params from per-sequence payload entries.
-
-    The first entry of ``payload["compression_params"]`` whose
-    ``compression_ratio`` is below ``1.0`` decides the batch compression
-    method (its ``compression_method`` string, defaulting to ``"none"``).
-    When no entry has a ratio below ``1.0``, default
-    ``BatchCompressionParams()`` are returned.
-    """
-    first_compressing = next(
-        (
-            entry
-            for entry in payload["compression_params"]
-            if float(entry["compression_ratio"]) < 1.0
-        ),
-        None,
-    )
-    if first_compressing is None:
-        return BatchCompressionParams()
-
-    method_name = str(first_compressing.get("compression_method", "none"))
-    method_id = compression_method_str_to_id(method_name)
-    return BatchCompressionParams(
-        compression_method=compression_method_id_to_enum(method_id)
-    )
-
-
-def _get_or_create_runner(worker: Any, payload: dict[str, Any]) -> ModelRunner:
-    """Return the kvprune ``ModelRunner`` cached on ``worker``, creating it on first use.
-
-    A runner already stored on the worker is returned as-is; ``payload`` is
-    only consulted (for ``eos_token_ids``) when a new runner is built.
-
-    Building a runner validates the parallel layout, instantiates the kvprune
-    model for the worker's ``model_type``, ties its weights and RoPE buffers
-    to the vLLM model, delegates embedding and logits computation to vLLM,
-    and finally stores the new runner on the worker.
-
-    Raises:
-        NotImplementedError: If pipeline or data parallelism is enabled.
-        RuntimeError: If the live TP world size differs from the configured one.
-        ValueError: If the model type is not in ``MODEL_REGISTRY``.
-    """
-    cached = getattr(worker, _ATTR, None)
-    if cached is not None:
-        return cached
-
-    from vllm.distributed.parallel_state import (
-        get_tensor_model_parallel_rank,
-        get_tensor_model_parallel_world_size,
-    )
-
-    vllm_config = worker.vllm_config
-    parallel = vllm_config.parallel_config
-    if not (parallel.pipeline_parallel_size == 1 and parallel.data_parallel_size == 1):
-        raise NotImplementedError(
-            "KV-prune TP compressed generate requires pipeline_parallel_size=1 and "
-            f"data_parallel_size=1; got PP={parallel.pipeline_parallel_size}, "
-            f"DP={parallel.data_parallel_size}."
-        )
-
-    live_tp_size = get_tensor_model_parallel_world_size()
-    if live_tp_size != parallel.tensor_parallel_size:
-        raise RuntimeError(
-            f"parallel_state TP world size {live_tp_size} != config.tensor_parallel_size "
-            f"{parallel.tensor_parallel_size}"
-        )
-
-    hf_config = vllm_config.model_config.hf_config
-    model_type = getattr(hf_config, "model_type", None)
-    if model_type not in MODEL_REGISTRY:
-        raise ValueError(
-            f"KV-prune TP path: unsupported model_type={model_type!r}; "
-            f"registry has {sorted(MODEL_REGISTRY)}"
-        )
-
-    cfg = vllm_config_to_llm_config(vllm_config)
-    # De-duplicate and sort the EOS ids; the smallest one becomes ``cfg.eos``.
-    cfg.eos_token_ids = sorted({int(tok) for tok in payload["eos_token_ids"]})
-    cfg.eos = int(cfg.eos_token_ids[0])
-    _apply_compactor_env_overrides(cfg)
-
-    vllm_model = worker.get_model()
-    kv_model: nn.Module = MODEL_REGISTRY[model_type](hf_config)
-    tie_kvprune_weights_from_vllm(vllm_model, kv_model)
-
-    # Place the kvprune model like the vLLM model's first parameter, then
-    # copy RoPE buffers and delegate embedding/logits to vLLM.
-    reference_param = next(vllm_model.parameters())
-    kv_model.to(device=reference_param.device, dtype=reference_param.dtype)
-    tie_kvprune_rope_buffers_from_vllm(vllm_model, kv_model)
-    delegate_kvprune_embed_tokens_to_vllm(vllm_model, kv_model)
-    delegate_kvprune_compute_logits_to_vllm(vllm_model, kv_model)
-
-    tp_rank = get_tensor_model_parallel_rank()
-    device = torch.device(f"cuda:{torch.cuda.current_device()}")
-
-    # Rank 0 is constructed with an empty ``peer_events`` list; every other
-    # rank is constructed with ``batch_ready=None``.
-    role_kwargs: dict[str, Any] = (
-        {"peer_events": []} if tp_rank == 0 else {"batch_ready": None}
-    )
-    runner = ModelRunner(
-        cfg,
-        rank=tp_rank,
-        external_model=kv_model,
-        embedded_in_vllm_worker=True,
-        device=device,
-        **role_kwargs,
-    )
-
-    setattr(worker, _ATTR, runner)
-    return runner
-
-
-def run_kvprune_tp_compressed_generate(worker: Any, payload: dict[str, Any]) -> dict[str, Any]:
-    """Run one compressed generation session on this worker (called on every TP rank).
-
-    Every rank obtains its runner, builds the sequences and batch compression
-    params from ``payload`` and calls ``barrier_sync(use_tp_group=True)``.
-    Rank 0 then calls ``runner.generate``; all other ranks call
-    ``runner.run_peer_session``.
-
-    Returns:
-        On rank 0, a dict with ``tensor_parallel_rank``, ``prompt_token_ids``
-        and ``completion_token_ids``. On other ranks, a dict with
-        ``tensor_parallel_rank`` and ``ok``.
-    """
-    from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
-
-    tp_rank = get_tensor_model_parallel_rank()
-    runner = _get_or_create_runner(worker, payload)
-    sequences = _build_sequences(payload)
-    batch_params = _batch_compression_from_payload(payload)
-
-    barrier_sync(use_tp_group=True)
-
-    if tp_rank != 0:
-        # Peer ranks only take part in the session; they report no tokens.
-        runner.run_peer_session()
-        return {"tensor_parallel_rank": int(tp_rank), "ok": True}
-
-    runner.generate(sequences, batch_params)
-    prompts = [list(seq.prompt_token_ids) for seq in sequences]
-    completions = [list(seq.completion_token_ids) for seq in sequences]
-    return {
-        "tensor_parallel_rank": 0,
-        "prompt_token_ids": prompts,
-        "completion_token_ids": completions,
-    }
--- a/vllm/kvprune_legacy_save/integration/vllm_model_access.py
+++ b/vllm/kvprune_legacy_save/integration/vllm_model_access.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Access the in-process vLLM model weights for compactor weight sharing."""
-
-from __future__ import annotations
-
-import torch.nn as nn
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-def extract_vllm_causal_lm(llm: object) -> nn.Module:
-    """Return the model object held by the in-process worker of a v1 ``LLM``.
-
-    Walks ``llm.llm_engine.model_executor.driver_worker.worker`` and returns
-    the result of that worker's ``get_model()``.
-
-    Requires ``LLMEngine`` to have been constructed with ``multiprocess_mode=False``
-    so ``model_executor`` lives in-process (set ``VLLM_ENABLE_V1_MULTIPROCESSING=0``).
-
-    Raises:
-        RuntimeError: If any attribute along the chain is missing or ``None``,
-            or if the worker has no callable ``get_model``.
-    """
-    # (attribute to follow, error raised when it is missing or None)
-    chain = (
-        (
-            "llm_engine",
-            "Expected an object with a ``llm_engine`` attribute (e.g. ``vllm.LLM``).",
-        ),
-        (
-            "model_executor",
-            "model_executor is unavailable (multiprocess engine mode). "
-            "Set environment variable VLLM_ENABLE_V1_MULTIPROCESSING=0 for "
-            "in-process weight sharing.",
-        ),
-        (
-            "driver_worker",
-            "Executor has no driver_worker (unexpected executor type for weight sharing).",
-        ),
-        ("worker", "Worker wrapper has no worker loaded."),
-    )
-
-    node: object = llm
-    for attr_name, failure in chain:
-        node = getattr(node, attr_name, None)
-        if node is None:
-            raise RuntimeError(failure)
-
-    model_getter = getattr(node, "get_model", None)
-    if not callable(model_getter):
-        raise RuntimeError("Worker does not expose get_model().")
-
-    return model_getter()
--- a/vllm/kvprune_legacy_save/integration/weight_tie.py
+++ b/vllm/kvprune_legacy_save/integration/weight_tie.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Share vLLM parameter storage with compactor ``MODEL_REGISTRY`` models (TP=1 and embedded TP>1 workers)."""
-
-from __future__ import annotations
-
-import types
-
-import torch
-import torch.nn as nn
-
-from vllm.kvprune.utils.context import get_context
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-def tie_kvprune_weights_from_vllm(
-    vllm_model: nn.Module,
-    kvprune_model: nn.Module,
-    *,
-    strict: bool = True,
-) -> int:
-    """Alias kvprune parameters onto the vLLM tensors that share their names.
-
-    For every named parameter of ``kvprune_model`` that also exists in
-    ``vllm_model`` with the same shape, the kvprune parameter's ``.data`` is
-    rebound to the vLLM parameter's ``.data``.
-
-    Args:
-        vllm_model: Model returned by ``worker.get_model()`` (e.g. ``Qwen3ForCausalLM``).
-        kvprune_model: Instance from ``vllm.kvprune.models.MODEL_REGISTRY``.
-        strict: If True, raise when a kvprune parameter name is missing from
-            ``vllm_model``; if False, such parameters are skipped. A shape
-            mismatch on a shared name raises regardless of ``strict``.
-
-    Returns:
-        The number of parameters tied.
-
-    Raises:
-        ValueError: On a missing name (strict mode), on a shape mismatch, or
-            when no parameter could be tied at all.
-    """
-    sources = dict(vllm_model.named_parameters())
-    targets = dict(kvprune_model.named_parameters())
-
-    tied = 0
-    for name, target in targets.items():
-        source = sources.get(name)
-        if source is None:
-            if not strict:
-                continue
-            raise ValueError(
-                f"kvprune parameter {name!r} not found in vLLM model; "
-                "architecture/layout may differ (disable strict tying only "
-                "for expert debugging)."
-            )
-        if source.shape != target.shape:
-            raise ValueError(
-                f"Shape mismatch for {name}: vllm {source.shape} vs kvprune {target.shape}"
-            )
-        # Rebind the data tensor rather than copying it.
-        target.data = source.data
-        tied += 1
-
-    if not tied:
-        raise ValueError(
-            "No parameters were tied — check that vLLM and kvprune model types match "
-            "and use the same state_dict names."
-        )
-    logger.info("Tied %d parameters from vLLM into compactor model (shared storage).", tied)
-    return tied
-
-
-def tie_kvprune_rope_buffers_from_vllm(
-    vllm_model: nn.Module,
-    kvprune_model: nn.Module,
-) -> int:
-    """Copy RoPE ``cos_sin_cache`` buffers from vLLM into kvprune.
-
-    :func:`tie_kvprune_weights_from_vllm` only aliases :class:`~torch.nn.Parameter`
-    tensors, while RoPE tables live in buffers. kvprune's simplified
-    ``RotaryEmbedding`` can disagree with vLLM's ``rope_parameters`` (YaRN,
-    etc.), so the vLLM values are copied over to keep Q/K rotation aligned
-    with the main model.
-
-    Every kvprune buffer whose name contains ``cos_sin_cache`` is matched by
-    name against the vLLM buffers. Two layouts are accepted: identical shapes,
-    or kvprune ``[max_len, 1, rotary_dim]`` against vLLM ``[max_len,
-    rotary_dim]``, in which case the vLLM tensor is ``unsqueeze(1)``-ed before
-    the copy. Buffers absent from vLLM are left untouched with a warning.
-
-    Returns:
-        The number of buffers copied.
-
-    Raises:
-        ValueError: If a matched pair has incompatible shapes.
-    """
-    vllm_buffers = dict(vllm_model.named_buffers())
-    copied = 0
-    for name, target in kvprune_model.named_buffers():
-        if "cos_sin_cache" not in name:
-            continue
-        if name not in vllm_buffers:
-            logger.warning(
-                "kvprune RoPE buffer %r not found in vLLM; leaving kvprune cache",
-                name,
-            )
-            continue
-
-        source = vllm_buffers[name]
-        if source.shape == target.shape:
-            payload = source
-        elif target.dim() == 3 and source.dim() == 2:
-            layout_ok = (
-                target.shape[0] == source.shape[0]
-                and target.shape[2] == source.shape[1]
-                and target.shape[1] == 1
-            )
-            if not layout_ok:
-                raise ValueError(
-                    f"cos_sin_cache shape mismatch for {name!r}: "
-                    f"vLLM {tuple(source.shape)} vs kvprune {tuple(target.shape)}"
-                )
-            # Insert kvprune's singleton middle dimension.
-            payload = source.unsqueeze(1)
-        else:
-            raise ValueError(
-                f"Unsupported cos_sin_cache layout for {name!r}: "
-                f"vLLM {tuple(source.shape)} vs kvprune {tuple(target.shape)}"
-            )
-
-        target.copy_(payload)
-        copied += 1
-
-    if copied:
-        logger.info(
-            "Copied %d RoPE cos_sin_cache buffer(s) from vLLM into kvprune model.",
-            copied,
-        )
-    return copied
-
-
-def delegate_kvprune_embed_tokens_to_vllm(
-    vllm_model: nn.Module,
-    kvprune_model: nn.Module,
-) -> bool:
-    """Route kvprune's ``model.embed_tokens`` forward through vLLM's embedding module.
-
-    Even with tied weights, kvprune's simplified contiguous
-    ``VocabParallelEmbedding`` (``vocab_start = rank * partition``) can disagree with
-    vLLM's padded vocabulary and org/added shard ranges, producing invalid indices for
-    ``F.embedding`` on non-zero TP ranks (``index_copy_`` / device-side assert).
-    Calling vLLM's module instead keeps masks and indices aligned with the
-    main model while parameters remain shared storage.
-
-    Returns:
-        ``True`` when the forward was replaced; ``False`` when either model
-        lacks a ``model`` attribute or an ``embed_tokens`` submodule.
-    """
-    if not (hasattr(vllm_model, "model") and hasattr(kvprune_model, "model")):
-        return False
-
-    vllm_embed = getattr(vllm_model.model, "embed_tokens", None)
-    kvprune_embed = getattr(kvprune_model.model, "embed_tokens", None)
-    if vllm_embed is None or kvprune_embed is None:
-        logger.warning(
-            "delegate_kvprune_embed_tokens_to_vllm: embed_tokens missing; skipped"
-        )
-        return False
-
-    def _forward(_self_unused: nn.Module, x):
-        # The bound kvprune module is ignored; vLLM's embedding does the work.
-        return vllm_embed(x)
-
-    kvprune_embed.forward = types.MethodType(_forward, kvprune_embed)
-    logger.info(
-        "kvprune model.embed_tokens forward delegated to vLLM (correct vocab-parallel masks)."
-    )
-    return True
-
-
-def delegate_kvprune_compute_logits_to_vllm(
-    vllm_model: nn.Module,
-    kvprune_model: nn.Module,
-) -> bool:
-    """Replace ``kvprune_model.compute_logits`` with a wrapper around vLLM's.
-
-    Standalone compactor used :class:`~vllm.kvprune.layers.embed_head.ParallelLMHead`
-    with ``F.linear`` + TP gather. vLLM applies :class:`~vllm.model_executor.layers.logits_processor.LogitsProcessor`
-    (gather/all-gather, padded-vocab trim, quant hooks). Mismatch here commonly
-    produces garbage token distributions while the rest of the stack looks fine.
-
-    The installed wrapper reads the current kvprune context. During prefill
-    with ``cu_seqlens_q`` available it keeps only the hidden state of the last
-    token of each packed sequence, then passes a contiguous tensor to
-    ``vllm_model.compute_logits``.
-
-    Returns:
-        ``True`` when the wrapper was installed; ``False`` (with a warning)
-        when ``vllm_model`` has no callable ``compute_logits``.
-    """
-    vllm_compute_logits = getattr(vllm_model, "compute_logits", None)
-    if not callable(vllm_compute_logits):
-        logger.warning(
-            "delegate_kvprune_compute_logits_to_vllm: vLLM model has no compute_logits; skipped"
-        )
-        return False
-
-    def _compute_logits(_self: nn.Module, hidden_states):
-        # Match kvprune :class:`~vllm.kvprune.layers.embed_head.ParallelLMHead`:
-        # prefill logits are for the **last** token of each packed sequence only.
-        ctx = get_context()
-        if ctx.is_prefill and ctx.cu_seqlens_q is not None:
-            last_token_idx = (ctx.cu_seqlens_q[1:] - 1).to(torch.long)
-            total_tokens = hidden_states.shape[0]
-            if total_tokens > 0:
-                # Keep indices inside the packed token range.
-                last_token_idx = last_token_idx.clamp(min=0, max=total_tokens - 1)
-            hidden_states = hidden_states[last_token_idx]
-        # vLLM lm_head + gather expect contiguous activations; non-contiguous views have
-        # caused garbage logits under TP in edge cases.
-        return vllm_model.compute_logits(hidden_states.contiguous())
-
-    kvprune_model.compute_logits = types.MethodType(_compute_logits, kvprune_model)
-    return True
--- a/vllm/kvprune_legacy_save/kv_cache/__init__.py
+++ b/vllm/kvprune_legacy_save/kv_cache/__init__.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Paged KV cache helpers and Triton KV store."""
-
-from vllm.kvprune.kv_cache.store_kv_cache import (
-    decode_store_kv,
-    prefill_store_all_kv,
-    prefill_store_topk_kv,
-)
-
-__all__ = [
-    "decode_store_kv",
-    "prefill_store_all_kv",
-    "prefill_store_topk_kv",
-]