# executor_base.py
import asyncio
from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, MultiModalConfig, ParallelConfig,
                         PromptAdapterConfig, SchedulerConfig,
                         SpeculativeConfig)
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput


class ExecutorBase(ABC):
    """Base class for all executors.

    An executor is responsible for executing the model on a specific device
    type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
    that can execute the model on multiple devices.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        multimodal_config: Optional[MultiModalConfig],
        speculative_config: Optional[SpeculativeConfig],
        prompt_adapter_config: Optional[PromptAdapterConfig],
    ) -> None:
        """Store every configuration object on the instance, then run the
        subclass initialization hook.

        Each argument is saved unchanged under an attribute of the same
        name. The four ``Optional`` configs may be ``None``.
        """
        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.load_config = load_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.multimodal_config = multimodal_config
        self.speculative_config = speculative_config
        self.prompt_adapter_config = prompt_adapter_config

        # Called last so the hook can read every config attribute set above.
        self._init_executor()

    @abstractmethod
    def _init_executor(self) -> None:
        """Subclass hook invoked at the end of ``__init__``."""
        pass

    @abstractmethod
    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available blocks for the GPU KV cache and
        swappable CPU KV cache.

        Normally, this should simply delegate to the underlying Worker. Some
        ExecutorBase may require modification of the result, e.g. to ensure the
        selected cache sizes are compatible with all workers.

        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
        are blocks that are "active" on the device and can be appended to.
        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
        appended to.
        """
        raise NotImplementedError

    @abstractmethod
    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache with the given size in blocks.
        """
        raise NotImplementedError

    @abstractmethod
    def execute_model(
        self, execute_model_req: ExecuteModelRequest
    ) -> Optional[List[SamplerOutput]]:
        """Executes at least one model step on the given sequences."""
        raise NotImplementedError

    def stop_remote_worker_execution_loop(self) -> None:
        """Releases parallel workers from model loop."""
        # The base implementation is a no-op; subclasses may override it.
        return

    @abstractmethod
    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Handle ``lora_request``; must be implemented by subclasses.

        NOTE(review): the bool presumably reports success - confirm against
        the concrete executors.
        """
        raise NotImplementedError

    @abstractmethod
    def remove_lora(self, lora_id: int) -> bool:
        """Remove the LoRA identified by ``lora_id``; must be implemented by
        subclasses.

        NOTE(review): the bool presumably reports success - confirm.
        """
        raise NotImplementedError

    @abstractmethod
    def pin_lora(self, lora_id: int) -> bool:
        """Pin the LoRA identified by ``lora_id``; must be implemented by
        subclasses.

        NOTE(review): the bool presumably reports success - confirm.
        """
        raise NotImplementedError  # type: ignore

    @abstractmethod
    def list_loras(self) -> Set[int]:
        """Return a set of integer LoRA ids; must be implemented by
        subclasses."""
        raise NotImplementedError

    @abstractmethod
    def add_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        """Handle ``prompt_adapter_request``; must be implemented by
        subclasses.

        NOTE(review): the bool presumably reports success - confirm.
        """
        raise NotImplementedError

    @abstractmethod
    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        """Remove the prompt adapter identified by ``prompt_adapter_id``;
        must be implemented by subclasses.

        NOTE(review): the bool presumably reports success - confirm.
        """
        raise NotImplementedError

    @abstractmethod
    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        """Pin the prompt adapter identified by ``prompt_adapter_id``; must
        be implemented by subclasses.

        NOTE(review): the bool presumably reports success - confirm.
        """
        raise NotImplementedError  # type: ignore

    @abstractmethod
    def list_prompt_adapters(self) -> Set[int]:
        """Return a set of integer prompt adapter ids; must be implemented
        by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def check_health(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        raise NotImplementedError

    def shutdown(self) -> None:
        """Shutdown the executor."""
        # The base implementation is a no-op; subclasses may override it.
        return

    def __del__(self):
        """Call ``shutdown()`` when the instance is finalized."""
        self.shutdown()


class ExecutorAsyncBase(ExecutorBase):
    """Extension of ``ExecutorBase`` that adds coroutine counterparts of
    the execution, loop-release and health-check methods."""

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        multimodal_config: Optional[MultiModalConfig],
        speculative_config: Optional[SpeculativeConfig],
        prompt_adapter_config: Optional[PromptAdapterConfig],
    ) -> None:
        """Initialize ``pp_locks`` to ``None`` and forward every config to
        ``ExecutorBase.__init__`` unchanged."""
        # Assigned before super().__init__() because that call finishes by
        # invoking self._init_executor(); the attribute therefore already
        # exists when the hook runs.
        self.pp_locks: Optional[List[asyncio.Lock]] = None

        super().__init__(model_config, cache_config, parallel_config,
                         scheduler_config, device_config, load_config,
                         lora_config, multimodal_config, speculative_config,
                         prompt_adapter_config)

    @abstractmethod
    async def execute_model_async(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Executes one model step on the given sequences."""
        raise NotImplementedError

    async def stop_remote_worker_execution_loop_async(self) -> None:
        """Releases parallel workers from model loop."""
        # The base implementation is a no-op; subclasses may override it.
        return

    async def check_health_async(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        # Default: delegate to the synchronous check_health() in-line.
        self.check_health()