"vscode:/vscode.git/clone" did not exist on "808a7b69df479b6b3a16181711cac7ca28a9b941"
openvino.py 5.75 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
from typing import TYPE_CHECKING, Optional
4

5
6
7
import torch

import vllm.envs as envs
8
from vllm.logger import init_logger
9

10
from .interface import Platform, PlatformEnum, _Backend
11

12
13
14
15
16
# VllmConfig is referenced only in type annotations. The real class is
# imported solely for static type checkers; at runtime the name is bound to
# None so that annotations mentioning it still evaluate.
if TYPE_CHECKING:
    from vllm.config import VllmConfig
else:
    VllmConfig = None

17
18
logger = init_logger(__name__)

19
20
21
22
23
24
# OpenVINO is imported defensively: an ImportError is only logged, so this
# module can still be imported when OpenVINO is not installed. In that case
# `ov` and `hints` remain undefined and any later use raises NameError.
try:
    import openvino as ov
    import openvino.properties.hint as hints
except ImportError as e:
    logger.warning("Failed to import OpenVINO with %r", e)

25
26
27

class OpenVinoPlatform(Platform):
    """Platform hooks for running vLLM on the OpenVINO backend.

    The target device is read from ``envs.VLLM_OPENVINO_DEVICE``;
    ``is_openvino_cpu`` and ``is_openvino_gpu`` test that value for the
    substrings ``"CPU"`` and ``"GPU"`` respectively.
    """

    _enum = PlatformEnum.OPENVINO
    device_name: str = "openvino"
    device_type: str = "openvino"
    dispatch_key: str = "CPU"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Return the dotted path of the attention backend class to use.

        The OpenVINO attention backend is always returned. A different
        ``selected_backend`` is only reported in the log; it does not change
        the result. The remaining parameters are not consulted here.
        """
        if selected_backend != _Backend.OPENVINO:
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        logger.info("Using OpenVINO Attention backend.")
        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the device name, which is ``"openvino"`` for every id."""
        return "openvino"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return ``False``: async output is never reported as supported."""
        return False

    @classmethod
    def inference_mode(cls):
        """Return a ``torch.inference_mode(mode=True)`` context manager."""
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(cls) -> bool:
        """Return whether ``VLLM_OPENVINO_DEVICE`` contains ``"CPU"``."""
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(cls) -> bool:
        """Return whether ``VLLM_OPENVINO_DEVICE`` contains ``"GPU"``."""
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Log a warning and return ``False`` (no pinned memory)."""
        logger.warning("Pin memory is not supported on OpenViNO.")
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate ``vllm_config`` and adjust it in place for OpenVINO.

        Steps, in order:

        1. Require ``parallel_config.world_size == 1`` and replace an
           ``"auto"`` worker class with the OpenVINO worker.
        2. Force the model dtype to ``torch.float32`` and enable eager mode,
           logging a warning whenever a value is changed.
        3. When a cache config is present, set its KV-cache dtype, block
           size and KV-cache space (see ``_update_cache_config``).
        4. Require the ``"openvino"`` device type, no LoRA config, and a
           CPU or GPU target device.

        Raises:
            AssertionError: if one of the requirements in steps 1 or 4 does
                not hold.
            RuntimeError: if ``VLLM_OPENVINO_KVCACHE_SPACE`` is negative.
        """
        parallel_config = vllm_config.parallel_config
        assert (parallel_config.world_size == 1
                ), "OpenVINO only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.openvino_worker.OpenVINOWorker"

        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype != torch.float32:
            logger.warning(
                "Only float32 dtype is supported on OpenVINO, "
                "casting from %s.", model_config.dtype)
            model_config.dtype = torch.float32
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on OpenVINO backend, fallback to "
                "the eager mode.")
            model_config.enforce_eager = True

        # check and update cache config
        # The cache config may be absent; skip it as a whole rather than
        # guarding only the first access and failing on the next one.
        cache_config = vllm_config.cache_config
        if cache_config is not None:
            cls._update_cache_config(cache_config)

        assert vllm_config.device_config.device_type == "openvino"
        assert vllm_config.lora_config is None, \
            "OpenVINO backend doesn't support LoRA"
        assert cls.is_openvino_cpu() or \
            cls.is_openvino_gpu(), \
            "OpenVINO backend supports only CPU and GPU devices"

    @classmethod
    def _update_cache_config(cls, cache_config) -> None:
        """Set KV-cache dtype, block size and space on ``cache_config``.

        Args:
            cache_config: the (non-None) cache section of the vLLM config;
                it is modified in place.

        Raises:
            RuntimeError: if ``VLLM_OPENVINO_KVCACHE_SPACE`` is negative.
        """
        from vllm.utils import GiB_bytes

        ov_core = ov.Core()
        on_cpu = cls.is_openvino_cpu()

        if cache_config.block_size is None:
            cache_config.block_size = 16

        # KV-cache dtype: u8 applies only when requested and running on CPU;
        # otherwise bf16 on CPU when the device reports bf16 inference
        # precision, and f16 in every remaining case.
        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if not on_cpu:
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                            "ignored for GPU, f16 data type will be used.")
                cache_config.cache_dtype = ov.Type.f16
            else:
                logger.info("KV cache type is overridden to u8 via "
                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
                cache_config.cache_dtype = ov.Type.u8
        elif on_cpu:
            inference_precision = ov_core.get_property(
                envs.VLLM_OPENVINO_DEVICE, hints.inference_precision)
            if inference_precision == ov.Type.bf16:
                cache_config.cache_dtype = ov.Type.bf16
            else:
                cache_config.cache_dtype = ov.Type.f16
        else:
            cache_config.cache_dtype = ov.Type.f16

        # Block size is forced to the per-device value: 32 on CPU, 16
        # otherwise.
        device_label, optimal_block_size = ("CPU", 32) if on_cpu else ("GPU",
                                                                       16)
        if cache_config.block_size != optimal_block_size:
            logger.info(
                "OpenVINO %s optimal block size is %d, "
                "overriding currently set %s", device_label,
                optimal_block_size, cache_config.block_size)
            cache_config.block_size = optimal_block_size

        # KV-cache space in GiB; on CPU, 0 is treated as "not set" and
        # replaced by a 4 GiB default.
        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
        if kv_cache_space < 0:
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a non-negative integer value.")
        if kv_cache_space == 0 and on_cpu:
            cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
            logger.warning(
                "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                "for OpenVINO backend is not set, using 4 by default.")
        else:
            cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                kv_cache_space * GiB_bytes)