import os
from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Optional

from transformers import AutoConfig


class AttentionBackend(Enum):
    """Legacy coarse backend toggle (prefer :class:`KvpruneAttentionSchedule`)."""

    FLASH_ATTENTION = auto()
    COMPACTOR_TRITON = auto()


class KvpruneAttentionSchedule(Enum):
    """FlashAttention vs Triton split for prefill / decode (KV **writes** stay Triton)."""

    # Default: FA varlen prefill; decode uses ``head_sparse_decode_attention`` (Triton).
    FA_PREFILL_TRITON_DECODE = auto()
    # Prefill attention uses ``causal_sparse_varlen_with_cache`` (Triton); decode Triton.
    TRITON_PREFILL_TRITON_DECODE = auto()
    # "PDFA": FA prefill + FA decode; paged KV **storage** (incl. pruned top-k) unchanged.
    PDFA = auto()


@dataclass
class LLMConfig:
    """Configuration for the :class:`LLM` engine.

    Parameters
    ----------
    model : str
        Hugging Face model identifier (e.g. ``"meta-llama/Meta-Llama-3-8B"``)
        or a local model name that can be resolved by
        :func:`transformers.AutoConfig.from_pretrained`.
    path : str, optional
        Local directory containing the model weights. If ``None``, the engine
        will attempt to resolve a local snapshot for ``model`` using
        :func:`huggingface_hub.snapshot_download`. When given, it must be an
        existing directory (checked in ``__post_init__``).
    nccl_port : int, optional, default 1218
        Presumably the port used for NCCL / distributed initialization; it is
        only stored by this class (TODO confirm against the engine code).
    max_num_seqs : int, default 256
        Upper bound on the number of concurrent batches that the scheduler
        and KV-cache manager are allowed to handle. This affects the size of
        the page table and some internal buffers.
    max_model_len : int, default 40960
        Maximum context length (in tokens) that the engine will allocate KV
        cache and CUDA graphs for. During initialization this value is clamped
        to ``hf_config.max_position_embeddings`` for the chosen model.
    gpu_memory_utilization : float, default 0.9
        Fraction of the total GPU memory that may be used for KV cache and
        model activations. Values should be in ``(0, 1]``. If this budget is
        too small, the KV-cache manager may raise an error at warmup time due
        to insufficient memory.
    tensor_parallel_size : int, default 1
        Number of tensor-parallel workers to shard the model across. Must be
        between 1 and 8, and must evenly divide the model's number of
        key/value heads. Only the ``1..8`` range is validated by this class.
    enforce_eager : bool, default False
        If ``True``, disable CUDA graph capture and always run the model in
        eager mode during decoding. This reduces throughput. When ``False``,
        the engine will capture and reuse CUDA graphs for supported batch
        sizes and sequence lengths.
    hf_config : transformers.AutoConfig, optional
        Pre-loaded Hugging Face configuration for the model. If ``None``, it
        will then be populated automatically based on ``model``.
    eos : int, default -1
        Primary stop token id (warmup / single-id paths). If ``-1``, the
        :class:`LLM` constructor fills this and :attr:`eos_token_ids` from the
        tokenizer.
    eos_token_ids : list of int, optional
        All token ids that terminate generation (e.g. HF tokenizers may expose
        ``eos_token_id`` as a list for chat models). If ``None``, inferred in
        :class:`LLM` from the tokenizer and model type.
    kvcache_page_size : int, default 128
        Number of tokens stored in a single KV-cache page. Smaller pages
        improve allocation flexibility but increase page-table overhead;
        larger pages reduce overhead but have coarser granularity.
    leverage_sketch_size : int, default 48
        Sketch dimension used by the Compactor leverage-score estimator.
    attention_schedule : KvpruneAttentionSchedule, default FA_PREFILL_TRITON_DECODE
        Which **attention** implementation runs on prefill vs decode. KV
        **writes** (``prefill_store_*``, ``decode_store_kv``, pruned top-k)
        always use the existing Triton store kernels. Env
        ``VLLM_KVPRUNE_ATTENTION_SCHEDULE`` uses short names: ``fa_triton``
        (default), ``pdtriton``, ``pdfa``; this dataclass itself does not read
        that variable. Enum values: ``FA_PREFILL_TRITON_DECODE`` — FA prefill,
        Triton decode; ``TRITON_PREFILL_TRITON_DECODE`` — Triton prefill +
        decode; ``PDFA`` — FA prefill + FA decode (still Triton KV I/O).
    attention_backend : AttentionBackend, optional
        Deprecated; kept for backward compatibility. When not ``None``,
        ``__post_init__`` overwrites ``attention_schedule``:
        ``FLASH_ATTENTION`` maps to ``FA_PREFILL_TRITON_DECODE`` and any other
        value maps to ``TRITON_PREFILL_TRITON_DECODE``.
    show_progress_bar : bool, default True
        Presumably toggles progress-bar output in the engine; it is only
        stored by this class (TODO confirm against the engine code).

    Raises
    ------
    NotADirectoryError
        If ``path`` is given but is not an existing directory.
    ValueError
        If ``tensor_parallel_size`` is outside ``1..8``.
    """

    model: str
    path: Optional[str] = None
    nccl_port: Optional[int] = 1218
    max_num_seqs: int = 256
    max_model_len: int = 40960
    gpu_memory_utilization: float = 0.9
    tensor_parallel_size: int = 1
    enforce_eager: bool = False
    hf_config: AutoConfig | None = None
    eos: int = -1
    eos_token_ids: Optional[List[int]] = None
    kvcache_page_size: int = 128
    leverage_sketch_size: int = 48
    attention_schedule: KvpruneAttentionSchedule = (
        KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
    )
    attention_backend: AttentionBackend | None = None
    show_progress_bar: bool = True

    def __post_init__(self):
        """Validate the fields and derive the values that depend on the model.

        Maps the deprecated ``attention_backend`` onto ``attention_schedule``,
        checks ``path`` and ``tensor_parallel_size``, loads ``hf_config`` when
        it was not supplied, and clamps ``max_model_len`` to the model's
        ``max_position_embeddings``.
        """
        # Backward compatibility: a legacy backend choice, when given, decides
        # the schedule.
        if self.attention_backend is not None:
            if self.attention_backend == AttentionBackend.FLASH_ATTENTION:
                self.attention_schedule = (
                    KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
                )
            else:
                self.attention_schedule = (
                    KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
                )

        if self.path is not None and not os.path.isdir(self.path):
            raise NotADirectoryError(f"Engine config dir {self.path} does not exist")

        # Raise ValueError directly. An ``assert`` in this branch would always
        # fail first and mask the descriptive error (and be stripped under -O).
        if not 1 <= self.tensor_parallel_size <= 8:
            raise ValueError("tensor_parallel_size must be >= 1 and <= 8")

        if self.hf_config is None:
            self.hf_config = AutoConfig.from_pretrained(self.model)

        # Never allocate for a longer context than the model supports.
        self.max_model_len = min(
            self.max_model_len, self.hf_config.max_position_embeddings
        )