# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Build :class:`vllm.kvprune.config.engine_config.LLMConfig` from :class:`VllmConfig`."""

from __future__ import annotations

import os
from pathlib import Path

from vllm.config import VllmConfig
from vllm.kvprune.config.engine_config import LLMConfig, KvpruneAttentionSchedule
from vllm.logger import init_logger

logger = init_logger(__name__)


def _attention_schedule_from_env() -> KvpruneAttentionSchedule:
    """Resolve :class:`KvpruneAttentionSchedule` from env.

    Primary (``VLLM_KVPRUNE_ATTENTION_SCHEDULE``):

    - ``fa_triton`` — FA prefill, Triton decode (default). Aliases: ``fa_prefill``,
      ``default``.
    - ``pdtriton`` — Triton prefill + Triton decode. Aliases: ``triton``,
      ``triton_prefill``, ``compactor_prefill``, ``pd_triton``.
    - ``pdfa`` — FA prefill + FA decode (KV stores still Triton). Aliases:
      ``fa_full``, ``fa_both``.

    An unrecognized non-empty value logs a warning and falls back to
    ``fa_triton``; the legacy variable is not consulted in that case.

    Legacy (``VLLM_KVPRUNE_ATTENTION_BACKEND``), consulted only when the primary
    variable is unset or empty: ``flash``/``fa``/``flash_attention``/
    ``flashattention`` → ``fa_triton``; ``compactor``/``triton``/
    ``compactor_triton`` → ``pdtriton``. An unrecognized value logs a warning
    and falls back to ``fa_triton``.

    When neither variable is set, the result is ``fa_triton``.

    Values are compared after stripping whitespace and lower-casing.
    """
    schedule = os.environ.get("VLLM_KVPRUNE_ATTENTION_SCHEDULE", "").strip().lower()
    if schedule:
        if schedule in ("fa_triton", "fa_prefill", "default"):
            return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
        if schedule in (
            "pdtriton",
            "pd_triton",
            "triton",
            "triton_prefill",
            "compactor_prefill",
        ):
            return KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
        if schedule in ("pdfa", "fa_full", "fa_both"):
            return KvpruneAttentionSchedule.PDFA
        logger.warning(
            "Unknown VLLM_KVPRUNE_ATTENTION_SCHEDULE=%r; using FA_PREFILL_TRITON_DECODE",
            schedule,
        )
        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE

    # Primary variable is unset/empty: only now does the legacy variable apply.
    # (Previously an empty primary value returned early, so this lookup was
    # unreachable and the legacy variable was silently ignored.)
    legacy = os.environ.get("VLLM_KVPRUNE_ATTENTION_BACKEND", "").strip().lower()
    if not legacy:
        # Neither variable set: keep the documented default.
        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
    if legacy in ("flash", "fa", "flash_attention", "flashattention"):
        return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE
    if legacy in ("compactor", "triton", "compactor_triton"):
        return KvpruneAttentionSchedule.TRITON_PREFILL_TRITON_DECODE
    logger.warning(
        "Unknown VLLM_KVPRUNE_ATTENTION_BACKEND=%r; using FA_PREFILL_TRITON_DECODE",
        legacy,
    )
    return KvpruneAttentionSchedule.FA_PREFILL_TRITON_DECODE


def _compactor_kvcache_page_size(vllm_block_size: int | None) -> int:
    """Tokens per physical KV page for compactor :class:`LLMConfig`.

    ``compactor-vllm`` uses ``kvcache_page_size=128`` by default. Keeping that page
    size is important for correctness comparisons when validating the integrated
    ``kvprune`` backend against standalone compactor, especially for ``pdtriton``
    where paged-KV layout and page-padding behavior are part of the observed
    divergence on DCU.

    Override with ``VLLM_KVPRUNE_PAGE_SIZE``:

    - unset: use standalone-compactor-compatible ``128``
    - positive integer: use that exact page size (must be divisible by 32)
    - ``vllm`` / ``inherit`` / ``block``: derive from vLLM ``block_size`` and round up
      to the next multiple of 32 (the older integrated behavior)
    """
    env_v = os.environ.get("VLLM_KVPRUNE_PAGE_SIZE", "").strip().lower()
    if env_v:
        if env_v in ("vllm", "inherit", "block"):
            bs = 128 if vllm_block_size is None else int(vllm_block_size)
            if bs <= 0:
                return 128
            if bs % 32 == 0:
                return bs
            return ((bs + 31) // 32) * 32
        page_size = int(env_v)
        if page_size <= 0 or page_size % 32 != 0:
            raise ValueError(
                "VLLM_KVPRUNE_PAGE_SIZE must be a positive multiple of 32, "
                f"got {page_size}."
            )
        return page_size

    return 128


def vllm_config_to_llm_config(vc: VllmConfig) -> LLMConfig:
    """Translate a vLLM engine config into the compactor :class:`LLMConfig`.

    Reads the model, cache, parallel and scheduler sub-configs of ``vc`` and
    combines them with the env-derived KV page size and attention schedule.
    The resolved schedule and page size are logged at INFO level.

    Args:
        vc: The vLLM engine configuration to translate.

    Returns:
        A new :class:`LLMConfig` populated from ``vc`` plus fixed compactor
        settings (``nccl_port=1218``, ``enforce_eager=False``, ``eos=-1``,
        ``eos_token_ids=None``, ``leverage_sketch_size=48``,
        ``attention_backend=None``).
    """
    model_cfg = vc.model_config
    cache_cfg = vc.cache_config
    parallel_cfg = vc.parallel_config
    sched_cfg = vc.scheduler_config

    # vLLM may leave block_size unset; fall back to 16 in that case.
    raw_block_size = cache_cfg.block_size
    vllm_block_size = 16 if raw_block_size is None else raw_block_size

    # Scheduler configs without ``max_num_seqs`` get 256.
    seq_limit = getattr(sched_cfg, "max_num_seqs", 256)

    # ``model_config.enforce_eager`` (v1) is intentionally **not** forwarded to
    # compactor :class:`LLMConfig`; the two flags are independent. v1 uses its
    # flag only to skip *v1* ``capture_model()``, whereas kvprune
    # :class:`~vllm.kvprune.core.model_runner.ModelRunner` uses
    # :attr:`LLMConfig.enforce_eager` only for *compactor* decode CUDA graphs.
    # Shared-weight setup in ``compactor_shared`` defaults compactor to eager
    # decode; see ``VLLM_KVPRUNE_COMPACTOR_CUDA_GRAPH`` (default try graphs) /
    # ``VLLM_KVPRUNE_COMPACTOR_ENFORCE_EAGER``.

    # If the model string names a local checkpoint directory (one containing
    # ``config.json``), forward its resolved path so compactor skips redundant
    # Hub fetches. Filesystem errors are ignored: the path simply stays None.
    model_name = str(model_cfg.model)
    local_checkpoint: str | None = None
    try:
        if model_name:
            candidate = Path(model_name)
            if candidate.is_dir() and (candidate / "config.json").is_file():
                local_checkpoint = str(candidate.resolve())
    except OSError:
        pass

    kv_page_size = _compactor_kvcache_page_size(vllm_block_size)
    schedule = _attention_schedule_from_env()
    logger.info(
        "kvprune compactor config: attention_schedule=%s, kvcache_page_size=%d",
        schedule.name,
        kv_page_size,
    )

    return LLMConfig(
        model=model_name,
        path=local_checkpoint,
        nccl_port=1218,
        max_num_seqs=seq_limit,
        max_model_len=model_cfg.max_model_len,
        gpu_memory_utilization=cache_cfg.gpu_memory_utilization,
        tensor_parallel_size=parallel_cfg.tensor_parallel_size,
        enforce_eager=False,
        hf_config=model_cfg.hf_config,
        eos=-1,
        eos_token_ids=None,
        kvcache_page_size=kv_page_size,
        leverage_sketch_size=48,
        attention_schedule=schedule,
        attention_backend=None,
    )