"vllm/config/__init__.py" did not exist on "7b455cf1c036d12470374d716800d0fd09290a5a"
compactor_shared.py 6.23 KB
Newer Older
chenzk's avatar
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Construct compactor :class:`LLMEngine` sharing weight tensors with an in-process vLLM ``LLM``."""

from __future__ import annotations

import os

import torch.nn as nn

from vllm.config import VllmConfig
from vllm.kvprune.config.engine_config import LLMConfig
from vllm.kvprune.core.llm_engine import LLMEngine
from vllm.kvprune.integration.config_adapter import vllm_config_to_llm_config
from vllm.kvprune.integration.vllm_model_access import extract_vllm_causal_lm
from vllm.kvprune.integration.weight_tie import (
    delegate_kvprune_compute_logits_to_vllm,
    delegate_kvprune_embed_tokens_to_vllm,
    tie_kvprune_rope_buffers_from_vllm,
    tie_kvprune_weights_from_vllm,
)
from vllm.kvprune.models import MODEL_REGISTRY
from vllm.logger import init_logger

logger = init_logger(__name__)


def build_llm_config_for_compactor(vc: VllmConfig) -> LLMConfig:
    """Translate a vLLM ``VllmConfig`` into the compactor's :class:`LLMConfig`.

    Thin public wrapper: the conversion itself is delegated, unchanged, to
    ``vllm_config_to_llm_config``.
    """
    compactor_cfg = vllm_config_to_llm_config(vc)
    return compactor_cfg


def _env_tristate(name: str, default: str = "") -> bool | None:
    """Read env var *name* as a three-state flag.

    Returns ``True`` for ``1``/``true``/``yes``, ``False`` for ``0``/``false``/``no``
    (case-insensitive, surrounding whitespace ignored) and ``None`` for anything
    else, including an unset variable whose *default* is not one of those spellings.
    """
    raw = os.environ.get(name, default).strip().lower()
    if raw in ("1", "true", "yes"):
        return True
    if raw in ("0", "false", "no"):
        return False
    return None


def _compactor_max_num_seqs_cap() -> int | None:
    """Return the ``max_num_seqs`` cap from the environment, or ``None`` for no cap.

    ``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS`` defaults to ``32``. An empty value, ``0``
    or a negative number disables the cap.

    Raises:
        ValueError: the variable is set to something that is not an integer.
    """
    raw = os.environ.get("VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS", "32").strip()
    if not raw:
        return None
    try:
        limit = int(raw)
    except ValueError:
        # Name the offending variable; a bare ``int()`` error gives no hint of
        # where the bad value came from.
        raise ValueError(
            "VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS must be an integer "
            f"(0 or a negative value disables the cap); got {raw!r}"
        ) from None
    return limit if limit > 0 else None


def _compactor_enforce_eager_from_env() -> bool:
    """Resolve the compactor ``enforce_eager`` setting from the environment.

    ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER`` wins when it holds a recognised
    boolean spelling; otherwise ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH`` (default
    ``1``) decides, with only an explicit false value disabling graphs. The
    decision is logged at INFO level.
    """
    explicit_eager = _env_tristate("VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER")
    if explicit_eager is True:
        logger.info(
            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 → "
            "enforce_eager=True (skip compactor decode CUDA graphs)."
        )
        return True
    if explicit_eager is False:
        logger.info(
            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=0 → "
            "enforce_eager=False (try compactor CUDA graph capture)."
        )
        return False

    if _env_tristate("VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH", "1") is False:
        logger.info(
            "KV-prune compactor: VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 → "
            "enforce_eager=True (skip compactor decode CUDA graphs)."
        )
        return True

    logger.info(
        "KV-prune compactor: default try decode CUDA graphs; ModelRunner "
        "falls back to eager if capture yields none. Set "
        "VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1 or "
        "VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0 to skip capture."
    )
    return False


def create_compactor_engine_with_shared_weights(llm: object) -> LLMEngine:
    """Single GPU, TP=1: compactor ``LLMEngine`` whose weights alias vLLM tensors.

    Call after the vLLM ``LLM`` has loaded weights. Requires in-process executor
    (``VLLM_ENABLE_V1_MULTIPROCESSING=0``); that requirement is not checked here.

    Environment variables read:

    * ``VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS`` (default ``32``): upper bound applied
      to ``cfg.max_num_seqs``; empty, ``0`` or negative disables the bound.
    * ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER`` / ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH``:
      choose ``cfg.enforce_eager`` (see ``_compactor_enforce_eager_from_env``).
    * ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH`` (default ``0``): unless truthy,
      ``cfg.enforce_eager`` is forced to ``True`` regardless of the two above.

    Args:
        llm: Object exposing ``llm_engine`` with a ``vllm_config`` attribute.

    Returns:
        An ``LLMEngine`` built from the converted config, using the tied compactor
        model as ``external_model``.

    Raises:
        RuntimeError: ``llm`` has no ``llm_engine`` (or it is ``None``).
        ValueError: ``tensor_parallel_size`` is not 1, the max-num-seqs variable is
            not an integer, the converted config has no ``hf_config``, or its
            ``model_type`` is missing from ``MODEL_REGISTRY``.
    """
    llm_engine = getattr(llm, "llm_engine", None)
    if llm_engine is None:
        raise RuntimeError("Expected ``llm.llm_engine``.")
    vc: VllmConfig = llm_engine.vllm_config
    if vc.parallel_config.tensor_parallel_size != 1:
        raise ValueError(
            "Shared-weight compactor backend requires tensor_parallel_size=1"
        )

    cfg = vllm_config_to_llm_config(vc)
    # ``cfg.enforce_eager`` is for the compactor ``ModelRunner`` only (decode CUDA
    # graphs), not v1. v1 graph capture is controlled solely by ``LLM(...,
    # enforce_eager=...)`` / ``kvprune_compression=True`` on the entrypoint ``LLM``.
    # Large vLLM max_num_seqs blows up compactor page-table GPU memory; sharing the GPU
    # with v1 leaves little room for metadata + KV tensors. Default cap 32 so physical
    # KV pages stay usable; set VLLM_KVPRUNE_COMPACTOR_MAX_NUM_SEQS=0 to disable cap,
    # or raise (e.g. 128) if you have VRAM headroom.
    seq_cap = _compactor_max_num_seqs_cap()
    if seq_cap is not None:
        cfg.max_num_seqs = min(cfg.max_num_seqs, seq_cap)

    # Compactor decode graphs (``enforce_eager=False``): honored for non-shared-weight
    # engines. **Shared-weight** path (below) forces ``enforce_eager=True`` after
    # delegating ``compute_logits`` to vLLM unless ``VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1``.
    # Opt out of graphs for non-shared runs: ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER=1`` or
    # ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH=0``.
    cfg.enforce_eager = _compactor_enforce_eager_from_env()

    hf = cfg.hf_config
    if hf is None:
        # Explicit raise instead of ``assert``: asserts are stripped under ``-O``,
        # which would turn this into an opaque AttributeError on the next line.
        raise ValueError(
            "Compactor LLMConfig has no hf_config; cannot select a compactor model."
        )
    model_type = hf.model_type
    if model_type not in MODEL_REGISTRY:
        raise ValueError(
            f"Compactor MODEL_REGISTRY has no entry for model_type={model_type!r}; "
            f"supported: {sorted(MODEL_REGISTRY)}"
        )

    vllm_model = extract_vllm_causal_lm(llm)
    # Device and dtype of the first vLLM parameter are used as the placement target.
    first_param = next(vllm_model.parameters())
    device = first_param.device
    dtype = first_param.dtype

    # Build compactor shell on CPU first. **Do not** call ``.to(device)`` before tying:
    # that allocates a full second copy of weights on GPU; tying then frees the
    # duplicate but peak memory can OOM on large models. Tie first so parameters
    # alias vLLM tensors directly (no extra weight VRAM).
    kv_model: nn.Module = MODEL_REGISTRY[model_type](hf)
    tie_kvprune_weights_from_vllm(vllm_model, kv_model)
    # Buffers (e.g. RoPE tables) not in ``named_parameters`` may still be on CPU.
    # NOTE(review): ``Module.to(dtype=...)`` also casts floating-point parameters
    # whose dtype differs from ``dtype``; presumably every tied parameter already
    # matches the first parameter's dtype so aliasing is kept — confirm.
    kv_model.to(device=device, dtype=dtype)
    tie_kvprune_rope_buffers_from_vllm(vllm_model, kv_model)
    delegate_kvprune_embed_tokens_to_vllm(vllm_model, kv_model)
    delegate_kvprune_compute_logits_to_vllm(vllm_model, kv_model)

    # Compactor decode CUDA graphs capture ``model.forward`` + ``compute_logits`` in one
    # graph. Here ``compute_logits`` is delegated to vLLM's LM head / LogitsProcessor
    # (cublas GEMM, padded vocab, etc.). Embedding that in a nested capture commonly
    # fails with ``CUBLAS_STATUS_EXECUTION_FAILED`` and invalidates stream capture
    # (``cudaErrorStreamCaptureInvalidated``). Default: skip graphs for this integration.
    allow_shared_weight_graph = (
        _env_tristate("VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH", "0") is True
    )
    if not allow_shared_weight_graph:
        cfg.enforce_eager = True
        logger.info(
            "KV-prune shared-weight compactor: enforce_eager=True (skip compactor "
            "decode CUDA graphs; logits delegated to vLLM). Set "
            "VLLM_KVPRUNE_SHARED_WEIGHT_GRAPH=1 only to attempt capture (often fails)."
        )

    return LLMEngine(cfg, external_model=kv_model)