# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Build :class:`vllm.kvprune.config.engine_config.LLMConfig` from
:class:`VllmConfig`.
"""

from __future__ import annotations

import os
from pathlib import Path

from vllm.config import VllmConfig
from vllm.kvprune.config.engine_config import LLMConfig, KvpruneAttentionSchedule
from vllm.logger import init_logger

logger = init_logger(__name__)


def _attention_schedule_from_env() -> KvpruneAttentionSchedule:
    """Resolve :class:`KvpruneAttentionSchedule` from env.

    Primary (``VLLM_KVPRUNE_ATTENTION_SCHEDULE``):

    - ``fa_triton`` — FA prefill, Triton decode (default).
      Aliases: ``fa_prefill``, ``default``.
    - ``pdtriton`` — Triton prefill + Triton decode.
      Aliases: ``triton``, ``triton_prefill``, ``compactor_prefill``,
      ``pd_triton``.
    - ``pdfa`` — FA prefill + FA decode (KV stores still Triton).
      Aliases: ``fa_full``, ``fa_both``.

    Legacy: ``VLLM_KVPRUNE_ATTENTION_BACKEND`` is consulted only when the
    primary variable is unset or empty. It maps ``flash``/``fa`` →
    ``fa_triton`` and ``compactor``/``triton`` → ``pdtriton``.

    When both variables are unset or empty the default ``fa_triton`` schedule
    is returned. An unrecognized value in either variable logs a warning and
    falls back to the same default.

    Returns:
        The resolved attention schedule.
    """
    schedule = os.environ.get("VLLM_KVPRUNE_ATTENTION_SCHEDULE", "").strip().lower()
    if schedule:
        # An explicit primary setting always wins over the legacy variable.
        if schedule in ("fa_triton", "fa_prefill", "default"):
            return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
        if schedule in (
            "pdtriton",
            "pd_triton",
            "triton",
            "triton_prefill",
            "compactor_prefill",
        ):
            return KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
        if schedule in ("pdfa", "fa_full", "fa_both"):
            return KvpruneAttentionSchedule.PDFA
        logger.warning(
            "Unknown VLLM_KVPRUNE_ATTENTION_SCHEDULE=%r; using FA_PREFILL_TRITON_DECODE",
            schedule,
        )
        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE

    # Primary variable unset/empty: fall back to the legacy backend variable.
    backend = os.environ.get("VLLM_KVPRUNE_ATTENTION_BACKEND", "").strip().lower()
    if not backend:
        # Neither variable is set: keep the documented default schedule.
        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
    if backend in ("flash", "fa", "flash_attention", "flashattention"):
        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
    if backend in ("compactor", "triton", "compactor_triton"):
        return KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
    logger.warning(
        "Unknown VLLM_KVPRUNE_ATTENTION_BACKEND=%r; using FA_PREFILL_TRITON_DECODE",
        backend,
    )
    return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE


def _compactor_kvcache_page_size(vllm_block_size: int | None) -> int:
    """Tokens per physical KV page for compactor :class:`LLMConfig`.

    ``compactor-vllm`` uses ``kvcache_page_size=128`` by default. Keeping that
    page size is important for correctness comparisons when validating the
    integrated ``kvprune`` backend against standalone compactor, especially
    for ``pdtriton`` where paged-KV layout and page-padding behavior are part
    of the observed divergence on DCU.

    Override with ``VLLM_KVPRUNE_PAGE_SIZE``:

    - unset: use standalone-compactor-compatible ``128``
    - positive integer: use that exact page size (must be divisible by 32)
    - ``vllm`` / ``inherit`` / ``block``: derive from vLLM ``block_size`` and
      round up to the next multiple of 32 (the older integrated behavior)

    Args:
        vllm_block_size: vLLM's ``block_size``; only used in the
            ``vllm`` / ``inherit`` / ``block`` mode. ``None`` or a
            non-positive value yields ``128`` in that mode.

    Returns:
        The page size in tokens, always a positive multiple of 32.

    Raises:
        ValueError: If ``VLLM_KVPRUNE_PAGE_SIZE`` is neither one of the
            keywords above nor a positive integer multiple of 32.
    """
    env_v = os.environ.get("VLLM_KVPRUNE_PAGE_SIZE", "").strip().lower()
    if not env_v:
        return 128

    if env_v in ("vllm", "inherit", "block"):
        bs = 128 if vllm_block_size is None else int(vllm_block_size)
        if bs <= 0:
            return 128
        # Round up to the next multiple of 32 (no-op when already aligned).
        return ((bs + 31) // 32) * 32

    try:
        page_size = int(env_v)
    except ValueError:
        # Same exception type as a bare int() failure, but name the variable
        # so the misconfiguration is actionable.
        raise ValueError(
            "VLLM_KVPRUNE_PAGE_SIZE must be a positive multiple of 32 or one "
            f"of 'vllm', 'inherit', 'block', got {env_v!r}."
        ) from None
    if page_size <= 0 or page_size % 32 != 0:
        raise ValueError(
            "VLLM_KVPRUNE_PAGE_SIZE must be a positive multiple of 32, "
            f"got {page_size}."
        )
    return page_size


def vllm_config_to_llm_config(vc: VllmConfig) -> LLMConfig:
    """Map vLLM engine config to compactor :class:`LLMConfig`.

    Args:
        vc: The vLLM engine configuration; its model, cache, parallel and
            scheduler sub-configs are read.

    Returns:
        A compactor :class:`LLMConfig` populated from ``vc`` plus the
        ``VLLM_KVPRUNE_*`` environment overrides for page size and attention
        schedule.

    Raises:
        ValueError: If ``VLLM_KVPRUNE_PAGE_SIZE`` holds an invalid value.
    """
    mc = vc.model_config
    cc = vc.cache_config
    pc = vc.parallel_config
    sc = vc.scheduler_config

    block_size = cc.block_size
    if block_size is None:
        block_size = 16
    max_num_seqs = getattr(sc, "max_num_seqs", 256)

    # Do **not** forward ``model_config.enforce_eager`` (v1) into compactor
    # :class:`LLMConfig`. They are independent flags: v1 uses it only to skip
    # *v1* ``capture_model()``; kvprune
    # :class:`~vllm.kvprune.core.model_runner.ModelRunner` uses
    # :attr:`LLMConfig.enforce_eager` only for *compactor* decode CUDA graphs.
    # Shared-weight setup in ``compactor_shared`` defaults compactor to eager
    # decode; see ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH`` (default try graphs) /
    # ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER``.

    # Local checkpoint directory: forward so compactor skips redundant Hub
    # fetches.
    _model_s = str(mc.model)
    _path: str | None = None
    try:
        if (
            _model_s
            and Path(_model_s).is_dir()
            and (Path(_model_s) / "config.json").is_file()
        ):
            _path = str(Path(_model_s).resolve())
    except OSError:
        # Best effort: if the path cannot be inspected, leave ``path`` unset.
        pass

    page_size = _compactor_kvcache_page_size(block_size)
    attention_schedule = _attention_schedule_from_env()
    logger.info(
        "kvprune compactor config: attention_schedule=%s, kvcache_page_size=%d",
        attention_schedule.name,
        page_size,
    )
    return LLMConfig(
        model=_model_s,
        path=_path,
        nccl_port=1218,
        max_num_seqs=max_num_seqs,
        max_model_len=mc.max_model_len,
        gpu_memory_utilization=cc.gpu_memory_utilization,
        tensor_parallel_size=pc.tensor_parallel_size,
        enforce_eager=False,
        hf_config=mc.hf_config,
        eos=-1,
        eos_token_ids=None,
        kvcache_page_size=page_size,
        leverage_sketch_size=48,
        attention_schedule=attention_schedule,
        attention_backend=None,
    )