openvino.py 4.78 KB
Newer Older
1
2
from typing import TYPE_CHECKING

3
4
import openvino as ov
import openvino.properties.hint as hints
5
6
7
import torch

import vllm.envs as envs
8
from vllm.logger import init_logger
9

10
from .interface import Platform, PlatformEnum, _Backend
11

12
13
14
15
16
if TYPE_CHECKING:
    from vllm.config import VllmConfig
else:
    VllmConfig = None

17
18
logger = init_logger(__name__)

19
20
21

class OpenVinoPlatform(Platform):
    _enum = PlatformEnum.OPENVINO
22
    device_type: str = "openvino"
23
    dispatch_key: str = "CPU"
24

25
26
27
28
29
30
    @classmethod
    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
        if selected_backend != _Backend.OPENVINO:
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        return _Backend.OPENVINO

31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
    @classmethod
    def get_device_name(self, device_id: int = 0) -> str:
        return "openvino"

    @classmethod
    def inference_mode(self):
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(self) -> bool:
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(self) -> bool:
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(self) -> bool:
49
        logger.warning("Pin memory is not supported on OpenViNO.")
50
        return False
51
52
53

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
54
55
        from vllm.utils import GiB_bytes

56
57
58
59
60
61
62
63
        parallel_config = vllm_config.parallel_config
        assert (
            parallel_config.world_size == 1
        ), "OpenVINOExecutor only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.openvino_worker.OpenVINOWorker"
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype != torch.float32:
            logger.warning(
                f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}."  # noqa: G004, E501
            )
            model_config.dtype = torch.float32
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on OpenVINO backend, fallback to "
                "the eager mode.")
            model_config.enforce_eager = True

        # check and update cache config
        ov_core = ov.Core()
        cache_config = vllm_config.cache_config
        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if not OpenVinoPlatform.is_openvino_cpu():
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
                            "ignored for GPU, f16 data type will be used.")
                cache_config.cache_dtype = ov.Type.f16
            else:
                logger.info("KV cache type is overridden to u8 via "
                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
                cache_config.cache_dtype = ov.Type.u8
        else:
            if OpenVinoPlatform.is_openvino_cpu():
                ov_device = envs.VLLM_OPENVINO_DEVICE
                inference_precision = ov_core.get_property(
                    ov_device, hints.inference_precision)
                if inference_precision == ov.Type.bf16:
                    cache_config.cache_dtype = ov.Type.bf16
                else:
                    cache_config.cache_dtype = ov.Type.f16
            else:
                cache_config.cache_dtype = ov.Type.f16

        if OpenVinoPlatform.is_openvino_cpu():
            if cache_config.block_size != 32:
                logger.info(
                    f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}"  # noqa: G004, E501
                )
                cache_config.block_size = 32
        else:
            if cache_config.block_size != 16:
                logger.info(
                    f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}"  # noqa: G004, E501
                )
                cache_config.block_size = 16

        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
        if kv_cache_space >= 0:
            if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu():
                cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
                logger.warning(
                    "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                    "for OpenVINO backend is not set, using 4 by default.")
            else:
                cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                    kv_cache_space * GiB_bytes)
        else:
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")