# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Callable
5
from typing import Any, Literal
6

7
from pydantic import field_validator
8
9
10
11

from vllm.config.utils import config
from vllm.utils.hashing import safe_hash

12
13
14
15
16
17
18
19
20
21
22
23
# Names accepted for `KernelConfig.moe_backend`. Each option is described in
# that field's docstring. String input is lower-cased and has "-" replaced
# with "_" by `KernelConfig._normalize_moe_backend` before validation, so
# only the canonical lower-case, underscore-separated spellings appear here.
MoEBackend = Literal[
    "auto",
    "triton",
    "deep_gemm",
    "cutlass",
    "flashinfer_trtllm",
    "flashinfer_cutlass",
    "flashinfer_cutedsl",
    "marlin",
    "aiter",
]

24
25
26
27
28

@config
class KernelConfig:
    """Configuration for kernel selection and warmup behavior."""

    # The `None` default is a placeholder for delayed initialization;
    # `_skip_none_validation` below lets it pass through validation untouched.
    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
    """If True, run FlashInfer autotuning during kernel warmup."""

    moe_backend: MoEBackend = "auto"
    """Backend for MoE expert computation kernels. Available options:

    - "auto": Automatically select the best backend based on model and hardware\n
    - "triton": Use Triton-based fused MoE kernels\n
    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
    - "cutlass": Use vLLM CUTLASS kernels\n
    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
    - "marlin": Use Marlin kernels (weight-only quantization)\n
    - "aiter": Use AMD AITer kernels (ROCm only)"""

    @field_validator("moe_backend", mode="before")
    @classmethod
    def _normalize_moe_backend(cls, value: Any) -> Any:
        """Canonicalize a backend name before it is validated.

        String input is lower-cased and every "-" is replaced with "_", so
        spellings such as "Deep-GEMM" map onto the `MoEBackend` literal
        "deep_gemm". Non-string input is returned unchanged and left for the
        regular field validation to accept or reject.
        """
        if isinstance(value, str):
            return value.lower().replace("-", "_")
        return value

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("enable_flashinfer_autotune", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialization is delayed.

        Any other value is handed to `handler`, i.e. the field's normal
        validation, and that result is returned.
        """
        if value is None:
            return value
        return handler(value)