"vscode:/vscode.git/clone" did not exist on "f919d4cb8faac8c869ab87ee705dbd340fae4679"
# openvino_executor.py
from typing import List, Set, Tuple

import openvino as ov

import vllm.envs as envs
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)
from vllm.worker.worker_base import WorkerWrapperBase

# Module-level logger, keyed by this module's name.
logger = init_logger(__name__)


class OpenVINOExecutor(ExecutorBase):
    """Executor that runs the model through a single OpenVINO driver worker.

    All work (block accounting, cache initialization, model execution and
    LoRA management) is forwarded to ``self.driver_worker``, which is created
    in ``_init_worker``.
    """

    # This executor does not rely on Ray.
    uses_ray: bool = False

    def _init_executor(self) -> None:
        """Validate the configuration and bring up the driver worker.

        Raises:
            AssertionError: If the configured device type is not
                ``"openvino"``, if a LoRA config is present, or if the
                current platform is neither OpenVINO CPU nor OpenVINO GPU.
        """
        assert self.device_config.device_type == "openvino"
        assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
        assert current_platform.is_openvino_cpu() or \
            current_platform.is_openvino_gpu(), \
            "OpenVINO backend supports only CPU and GPU devices"

        # Instantiate the worker and load the model onto the OpenVINO device
        # (CPU or GPU, as checked above).
        self._init_worker()

    def _init_worker(self) -> None:
        """Create the driver worker, initialize its device and load the model.

        The resulting worker is stored in ``self.driver_worker``.
        """
        wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)

        # Built from the values returned by `get_ip()` and `get_open_port()`.
        distributed_init_method = get_distributed_init_method(
            get_ip(), get_open_port())
        # There is exactly one worker, so it is rank 0 and also the driver.
        # A fresh OpenVINO `Core` object is handed to it.
        self.driver_worker = wrapper.init_worker(
            ov_core=ov.Core(),
            vllm_config=self.vllm_config,
            local_rank=0,
            rank=0,
            distributed_init_method=distributed_init_method,
            kv_cache_dtype=self.cache_config.cache_dtype,
            is_driver_worker=True,
        )
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker.

        Returns:
            The pair of block counts reported by the driver worker
            (presumably device blocks first, then swap blocks, matching the
            argument order of ``initialize_cache`` -- confirm against the
            worker).
        """
        return self.driver_worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache by invoking the underlying worker.

        Args:
            num_gpu_blocks: Number of device blocks (see the note below on
                how this name is used for a CPU device).
            num_cpu_blocks: Number of swap blocks.
        """
        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        # NOTE: In case of a CPU device, `cpu block` for OpenVINO backend
        # is located on CPU memory but is referred as `gpu block`.
        # Because we want to reuse the existing block management procedure.
        device_blocks = num_gpu_blocks
        swap_blocks = num_cpu_blocks
        logger.info("OpenVINO %s: # device blocks: %d; # swap blocks: %d",
                    envs.VLLM_OPENVINO_DEVICE, device_blocks, swap_blocks)
        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Run the request on the driver worker and return its output."""
        output = self.driver_worker.execute_model(execute_model_req)
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Forward the LoRA add request to the driver worker."""
        return self.driver_worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        """Forward the LoRA removal request to the driver worker."""
        return self.driver_worker.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        """Forward the LoRA pin request to the driver worker."""
        return self.driver_worker.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        """Return the LoRA ids reported by the driver worker."""
        return self.driver_worker.list_loras()

    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Soft prompt is currently not supported by the OPENVINO backend.")

    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Soft prompt is currently not supported by the OPENVINO backend.")

    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Soft prompt is currently not supported by the OPENVINO backend.")

    def list_prompt_adapters(self) -> Set[int]:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Soft prompt is currently not supported by the OPENVINO backend.")

    def check_health(self) -> None:
        """No-op health check."""
        # OpenVINOExecutor will always be healthy as long as
        # it's running.
        return


class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
    """Asynchronous variant of ``OpenVINOExecutor``."""

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        """Await the driver worker's ``execute_model`` via ``make_async``.

        Args:
            execute_model_req: The request handed to the driver worker,
                passed by keyword.

        Returns:
            Whatever the wrapped ``execute_model`` call produces.
        """
        awaitable_execute = make_async(self.driver_worker.execute_model)
        return await awaitable_execute(execute_model_req=execute_model_req)

    async def check_health_async(self) -> None:
        """No-op health check: nothing is probed and nothing is raised."""
        # As long as the executor is running it is considered healthy.
        return None