# openvino.py
from typing import TYPE_CHECKING, Optional

import torch

import vllm.envs as envs
from vllm.logger import init_logger

from .interface import Platform, PlatformEnum, _Backend

if TYPE_CHECKING:
    # Only imported for static type checking.
    from vllm.config import VllmConfig
else:
    # Keep the name bound at runtime so that annotations which mention
    # VllmConfig (evaluated when the function is defined) do not fail.
    VllmConfig = None

logger = init_logger(__name__)

# A failed OpenVINO import is only logged, not re-raised. In that case `ov`
# and `hints` remain unbound, so any later use of them raises NameError.
try:
    import openvino as ov
    import openvino.properties.hint as hints
except ImportError as e:
    logger.warning("Failed to import OpenVINO with %r", e)


class OpenVinoPlatform(Platform):
    """Platform description and config normalisation for OpenVINO.

    The target device string is read from ``envs.VLLM_OPENVINO_DEVICE``;
    :meth:`is_openvino_cpu` and :meth:`is_openvino_gpu` test whether it
    contains ``"CPU"`` or ``"GPU"`` respectively.
    """

    _enum = PlatformEnum.OPENVINO
    device_name: str = "openvino"
    device_type: str = "openvino"
    dispatch_key: str = "CPU"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool) -> str:
        """Return the dotted path of the attention backend class to use.

        The OpenVINO attention backend is always returned. A different
        ``selected_backend`` is only reported through an info log; it is not
        an error. The remaining arguments are not consulted here.
        """
        if selected_backend != _Backend.OPENVINO:
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        logger.info("Using OpenVINO Attention backend.")
        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return ``"openvino"`` regardless of ``device_id``."""
        return "openvino"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return ``False``; ``enforce_eager`` is not consulted."""
        return False

    @classmethod
    def inference_mode(cls):
        """Return a ``torch.inference_mode(mode=True)`` context manager."""
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(cls) -> bool:
        """Return whether ``VLLM_OPENVINO_DEVICE`` contains ``"CPU"``."""
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(cls) -> bool:
        """Return whether ``VLLM_OPENVINO_DEVICE`` contains ``"GPU"``."""
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Log a warning and return ``False``."""
        logger.warning("Pin memory is not supported on OpenVINO.")
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate ``vllm_config`` and adjust it in place for OpenVINO.

        Steps, in order:

        * parallel config: require ``world_size == 1`` and replace an
          ``"auto"`` worker class with the OpenVINO worker;
        * model config: force ``torch.float32`` and eager mode, warning when
          either had to be changed;
        * cache config: pick the KV cache dtype, force the block size to 32
          on CPU or 16 otherwise, and derive the KV cache size in bytes from
          ``VLLM_OPENVINO_KVCACHE_SPACE``.

        Args:
            vllm_config: The configuration object to check and mutate.

        Raises:
            AssertionError: If ``parallel_config.world_size`` is not 1.
            RuntimeError: If ``VLLM_OPENVINO_KVCACHE_SPACE`` is negative.
        """
        from vllm.utils import GiB_bytes

        parallel_config = vllm_config.parallel_config
        assert (
            parallel_config.world_size == 1
        ), "OpenVINOExecutor only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.openvino_worker.OpenVINOWorker"

        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype != torch.float32:
            logger.warning(
                "Only float32 dtype is supported on OpenVINO, casting from "
                "%s.", model_config.dtype)
            model_config.dtype = torch.float32
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on OpenVINO backend, fallback to "
                "the eager mode.")
            model_config.enforce_eager = True

        # check and update cache config
        cache_config = vllm_config.cache_config
        if not cache_config:
            # The original code guarded only the block-size default against a
            # missing cache config and then dereferenced it unconditionally.
            # Without a cache config there is nothing left to update.
            return
        if cache_config.block_size is None:
            cache_config.block_size = 16

        # Evaluate the device check once; every decision below depends on it.
        is_cpu = cls.is_openvino_cpu()

        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if not is_cpu:
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                            "ignored for GPU, f16 data type will be used.")
                cache_config.cache_dtype = ov.Type.f16
            else:
                logger.info("KV cache type is overridden to u8 via "
                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
                cache_config.cache_dtype = ov.Type.u8
        elif is_cpu:
            # Only this branch queries the device, so the Core object is
            # created here rather than unconditionally.
            ov_core = ov.Core()
            inference_precision = ov_core.get_property(
                envs.VLLM_OPENVINO_DEVICE, hints.inference_precision)
            if inference_precision == ov.Type.bf16:
                cache_config.cache_dtype = ov.Type.bf16
            else:
                cache_config.cache_dtype = ov.Type.f16
        else:
            cache_config.cache_dtype = ov.Type.f16

        # Any other block size is replaced by the per-device value below.
        device_label, optimal_block_size = ("CPU", 32) if is_cpu else ("GPU",
                                                                        16)
        if cache_config.block_size != optimal_block_size:
            logger.info(
                "OpenVINO %s optimal block size is %d, overriding currently "
                "set %s", device_label, optimal_block_size,
                cache_config.block_size)
            cache_config.block_size = optimal_block_size

        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
        if kv_cache_space >= 0:
            if kv_cache_space == 0 and is_cpu:
                # Zero is treated as "not set" on CPU: fall back to 4 GiB.
                cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
                logger.warning(
                    "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                    "for OpenVINO backend is not set, using 4 by default.")
            else:
                cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                    kv_cache_space * GiB_bytes)
        else:
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")