# openvino.py
from typing import TYPE_CHECKING, Optional

import torch

import vllm.envs as envs
from vllm.logger import init_logger

from .interface import Platform, PlatformEnum, _Backend

if TYPE_CHECKING:
    # Needed for type annotations only; skipped at runtime.
    from vllm.config import VllmConfig
else:
    # Runtime placeholder so that annotations naming VllmConfig (evaluated
    # when the functions are defined) still resolve to something.
    VllmConfig = None

logger = init_logger(__name__)

# OpenVINO is imported on a best-effort basis: a failed import is logged
# instead of raised. When that happens `ov` and `hints` stay undefined, so
# any code path that touches them later fails with NameError.
try:
    import openvino as ov
    import openvino.properties.hint as hints
except ImportError as e:
    logger.warning("Failed to import OpenVINO with %r", e)

class OpenVinoPlatform(Platform):
    """Platform implementation for the OpenVINO backend.

    The target device is taken from ``envs.VLLM_OPENVINO_DEVICE``; the
    ``is_openvino_cpu`` / ``is_openvino_gpu`` predicates test that value by
    substring.
    """

    _enum = PlatformEnum.OPENVINO
    device_name: str = "openvino"
    device_type: str = "openvino"
    dispatch_key: str = "CPU"

    @classmethod
    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
        """Return the attention backend to use, which is always OPENVINO.

        Any other ``selected_backend`` is reported at INFO level and ignored.
        """
        if selected_backend != _Backend.OPENVINO:
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        return _Backend.OPENVINO

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the constant name "openvino"; ``device_id`` is not used."""
        return "openvino"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return False regardless of ``enforce_eager``."""
        return False

    @classmethod
    def inference_mode(cls):
        """Return a ``torch.inference_mode`` context with the mode enabled."""
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(cls) -> bool:
        """Return True if "CPU" occurs in ``envs.VLLM_OPENVINO_DEVICE``."""
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(cls) -> bool:
        """Return True if "GPU" occurs in ``envs.VLLM_OPENVINO_DEVICE``."""
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Return False, logging a warning on every call."""
        logger.warning("Pin memory is not supported on OpenVINO.")
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate ``vllm_config`` and adjust it in place for OpenVINO.

        Steps, in order:

        * parallel config: require ``world_size == 1`` and replace an
          ``"auto"`` worker class with the OpenVINO worker;
        * model config: force ``torch.float32`` and eager mode, warning when
          a value had to be changed;
        * cache config: choose the KV-cache dtype (``u8`` on CPU when
          ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION`` is ``"u8"``; otherwise
          ``bf16`` when the CPU device reports bf16 inference precision,
          else ``f16``), force the block size (32 on CPU, 16 otherwise) and
          set ``openvino_kvcache_space_bytes`` from
          ``VLLM_OPENVINO_KVCACHE_SPACE`` (a value of 0 on CPU falls back to
          4 GiB).

        Raises:
            AssertionError: if ``parallel_config.world_size`` is not 1.
            RuntimeError: if ``VLLM_OPENVINO_KVCACHE_SPACE`` is negative.
        """
        from vllm.utils import GiB_bytes

        parallel_config = vllm_config.parallel_config
        assert (
            parallel_config.world_size == 1
        ), "OpenVINOExecutor only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.openvino_worker.OpenVINOWorker"

        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype != torch.float32:
            logger.warning(
                "Only float32 dtype is supported on OpenVINO, "
                "casting from %s.", model_config.dtype)
            model_config.dtype = torch.float32
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on OpenVINO backend, fallback to "
                "the eager mode.")
            model_config.enforce_eager = True

        # check and update cache config
        ov_core = ov.Core()
        cache_config = vllm_config.cache_config
        # NOTE(review): this guard tolerates a missing cache_config, but
        # everything below dereferences it unconditionally -- confirm that
        # callers never pass a config without one.
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

        # The device string is consulted several times below; look it up once.
        is_cpu = cls.is_openvino_cpu()

        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if not is_cpu:
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                            "ignored for GPU, f16 data type will be used.")
                cache_config.cache_dtype = ov.Type.f16
            else:
                logger.info("KV cache type is overridden to u8 via "
                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
                cache_config.cache_dtype = ov.Type.u8
        elif is_cpu:
            # Follow the precision the CPU device reports: bf16 if that is
            # what it uses for inference, f16 otherwise.
            inference_precision = ov_core.get_property(
                envs.VLLM_OPENVINO_DEVICE, hints.inference_precision)
            if inference_precision == ov.Type.bf16:
                cache_config.cache_dtype = ov.Type.bf16
            else:
                cache_config.cache_dtype = ov.Type.f16
        else:
            cache_config.cache_dtype = ov.Type.f16

        # The block size is forced to the per-device optimum, overriding
        # whatever was configured (including the default applied above).
        if is_cpu:
            device_label, optimal_block_size = "CPU", 32
        else:
            device_label, optimal_block_size = "GPU", 16
        if cache_config.block_size != optimal_block_size:
            logger.info(
                "OpenVINO %s optimal block size is %d, "
                "overriding currently set %s", device_label,
                optimal_block_size, cache_config.block_size)
            cache_config.block_size = optimal_block_size

        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
        if kv_cache_space < 0:
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")

        if kv_cache_space == 0 and is_cpu:
            # Zero is treated as "not set" on CPU and replaced by 4 GiB.
            cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                4 * GiB_bytes)
            logger.warning(
                "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                "for OpenVINO backend is not set, using 4 by default.")
        else:
            cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                kv_cache_space * GiB_bytes)