# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Construct compactor :class:`LLMEngine` sharing weight tensors with an in-process vLLM ``LLM``.""" from __future__ import annotations import os import torch.nn as nn from vllm.config import VllmConfig from vllm.kvprune.config.engine_config import LLMConfig from vllm.kvprune.core.llm_engine import LLMEngine from vllm.kvprune.integration.config_adapter import vllm_config_to_llm_config from vllm.kvprune.integration.vllm_model_access import extract_vllm_causal_lm from vllm.kvprune.integration.weight_tie import ( delegate_kvprune_compute_logits_to_vllm, delegate_kvprune_embed_tokens_to_vllm, tie_kvprune_rope_buffers_from_vllm, tie_kvprune_weights_from_vllm, ) from vllm.kvprune.models import MODEL_REGISTRY from vllm.logger import init_logger logger = init_logger(__name__) def build_llm_config_for_compactor(vc: VllmConfig) -> LLMConfig: """Public helper: vLLM config → compactor :class:`LLMConfig`.""" return vllm_config_to_llm_config(vc) def create_compactor_engine_with_shared_weights(llm: object) -> LLMEngine: """Single GPU, TP=1: compactor ``LLMEngine`` whose weights alias vLLM tensors. Call after the vLLM ``LLM`` has loaded weights. Requires in-process executor (``VLLM_ENABLE_V1_MULTIPROCESSING=0``). """ llm_engine = getattr(llm, "llm_engine", None) if llm_engine is None: raise RuntimeError("Expected ``llm.llm_engine``.") vc: VllmConfig = llm_engine.vllm_config if vc.parallel_config.tensor_parallel_size != 1: raise ValueError( "Shared-weight compactor backend requires tensor_parallel_size=1" ) cfg = vllm_config_to_llm_config(vc) # ``cfg.enforce_eager`` is for the compactor ``ModelRunner`` only (decode CUDA # graphs), not v1. v1 graph capture is controlled solely by ``LLM(..., # enforce_eager=...)`` / ``kvprune_compression=True`` on the entrypoint ``LLM``. # Large vLLM max_num_seqs blows up compactor page-table GPU memory; sharing the GPU # with v1 leaves little room for metadata + KV tensors. Default cap 32 so physical # KV pages stay usable; set VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS=0 to disable cap, # or raise (e.g. 128) if you have VRAM headroom. _cap = os.environ.get("VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS", "32").strip() if _cap: lim = int(_cap) if lim > 0: cfg.max_num_seqs = min(cfg.max_num_seqs, lim) # Compactor decode graphs (``enforce_eager=False``): honored for non-shared-weight # engines. **Shared-weight** path (below) forces ``enforce_eager=True`` after # delegating ``compute_logits`` to vLLM unless ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1``. # Opt out of graphs for non-shared runs: ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1`` or # ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0``. _ce = os.environ.get("VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER", "").strip().lower() if _ce in ("1", "true", "yes"): cfg.enforce_eager = True logger.info( "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 → " "enforce_eager=True (skip compactor decode CUDA graphs)." ) elif _ce in ("0", "false", "no"): cfg.enforce_eager = False logger.info( "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=0 → " "enforce_eager=False (try compactor CUDA graph capture)." ) else: _dg = os.environ.get( "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH", "1" ).strip().lower() if _dg in ("0", "false", "no"): cfg.enforce_eager = True logger.info( "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 → " "enforce_eager=True (skip compactor decode CUDA graphs)." ) else: cfg.enforce_eager = False logger.info( "KV-prune compactor: default try decode CUDA graphs; ModelRunner " "falls back to eager if capture yields none. Set " "VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 or " "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 to skip capture." ) hf = cfg.hf_config assert hf is not None model_type = hf.model_type if model_type not in MODEL_REGISTRY: raise ValueError( f"Compactor MODEL_REGISTRY has no entry for model_type={model_type!r}; " f"supported: {sorted(MODEL_REGISTRY)}" ) vllm_model = extract_vllm_causal_lm(llm) device = next(vllm_model.parameters()).device dtype = next(vllm_model.parameters()).dtype # Build compactor shell on CPU first. **Do not** call ``.to(device)`` before tying: # that allocates a full second copy of weights on GPU; tying then frees the # duplicate but peak memory can OOM on large models. Tie first so parameters # alias vLLM tensors directly (no extra weight VRAM). kv_model: nn.Module = MODEL_REGISTRY[model_type](hf) tie_kvprune_weights_from_vllm(vllm_model, kv_model) # Buffers (e.g. RoPE tables) not in ``named_parameters`` may still be on CPU. kv_model.to(device=device, dtype=dtype) tie_kvprune_rope_buffers_from_vllm(vllm_model, kv_model) delegate_kvprune_embed_tokens_to_vllm(vllm_model, kv_model) delegate_kvprune_compute_logits_to_vllm(vllm_model, kv_model) # Compactor decode CUDA graphs capture ``model.forward`` + ``compute_logits`` in one # graph. Here ``compute_logits`` is delegated to vLLM's LM head / LogitsProcessor # (cublas GEMM, padded vocab, etc.). Embedding that in a nested capture commonly # fails with ``CUBLAS_STATUS_EXECUTION_FAILED`` and invalidates stream capture # (``cudaErrorStreamCaptureInvalidated``). Default: skip graphs for this integration. _sw_graph = os.environ.get( "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH", "0" ).strip().lower() in ("1", "true", "yes") if not _sw_graph: cfg.enforce_eager = True logger.info( "KV-prune shared-weight compactor: enforce_eager=True (skip compactor " "decode CUDA graphs; logits delegated to vLLM). Set " "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1 only to attempt capture (often fails)." ) return LLMEngine(cfg, external_model=kv_model)