worker_manager.py 11.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
from contextlib import contextmanager
5
from typing import Any, Literal
6
7
8

import torch

9
from vllm.config import VllmConfig
10
from vllm.logger import init_logger
11
12
from vllm.lora.lora_model import LoRAModel
from vllm.lora.model_manager import (
13
14
15
16
    LoRAModelManager,
    LRUCacheLoRAModelManager,
    create_lora_manager,
)
17
from vllm.lora.peft_helper import PEFTHelper
18
from vllm.lora.request import LoRARequest
19
from vllm.lora.utils import get_adapter_absolute_path
20

21
logger = init_logger(__name__)
22
23


24
class WorkerLoRAManager:
    """WorkerLoRAManager that manages LoRA models on the worker side.

    Every request, the requested LoRAs will be loaded (unless they are already
    loaded), and every other LoRA will be unloaded."""

    # Manager class handed to the module-level ``create_lora_manager`` factory.
    _manager_cls: type[LoRAModelManager] = LoRAModelManager

    def __init__(
        self,
        vllm_config: VllmConfig,
        device: torch.device,
        embedding_modules: dict[str, str],
        lora_model_cls: type[LoRAModel] = LoRAModel,
    ):
        """Capture the configuration needed to build and feed the manager.

        Args:
            vllm_config: Source of the scheduler, model and LoRA settings
                copied onto this instance.
            device: Device forwarded to the LoRA manager factory.
            embedding_modules: Mapping forwarded to ``create_dummy_lora``.
            lora_model_cls: Class whose ``from_local_checkpoint`` is used to
                load adapters from disk.
        """
        self._lora_model_cls = lora_model_cls
        self.embedding_modules = embedding_modules
        # Tri-state dummy-LoRA cache:
        #   False     -> caching disabled (outside ``dummy_lora_cache``)
        #   None      -> caching enabled, nothing cached yet
        #   LoRAModel -> cached dummy model that later calls clone
        self._cached_dummy_lora: None | Literal[False] | LoRAModel = False
        self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
        self.max_num_batched_tokens = (
            vllm_config.scheduler_config.max_num_batched_tokens
        )
        self.vocab_size = vllm_config.model_config.get_vocab_size()
        self.lora_config = vllm_config.lora_config

        # Use get_text_config() in case of multimodal models
        text_config = vllm_config.model_config.hf_config.get_text_config()

        self.max_position_embeddings = text_config.max_position_embeddings
        self.device = device
        # Lazily initialized by create_lora_manager.
        self._adapter_manager: LoRAModelManager

    @contextmanager
    def dummy_lora_cache(self):
        """Use this context manager to reuse the dummy lora model
        to avoid creating it repeatedly.

        The cache is always disabled again on exit, including when the
        ``with`` body raises, so a cached dummy model is never kept alive
        (or reused) outside the context."""
        self._cached_dummy_lora = None
        try:
            yield
        finally:
            self._cached_dummy_lora = False

    @property
    def is_enabled(self) -> bool:
        """Always ``True`` for this manager."""
        return True

    def create_lora_manager(
        self,
        model: torch.nn.Module,
        vllm_config: VllmConfig | None = None,
    ) -> Any:
        """Build the adapter manager for ``model`` and remember it.

        Args:
            model: Module handed to the ``create_lora_manager`` factory.
            vllm_config: Optional config forwarded to the factory unchanged.

        Returns:
            The ``model`` attribute of the newly created manager.
        """
        lora_manager = create_lora_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            device=self.device,
            lora_manager_cls=self._manager_cls,
            vllm_config=vllm_config,
        )
        self._adapter_manager = lora_manager
        return lora_manager.model

    def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
        """Load the adapter described by ``lora_request`` onto the CPU.

        Args:
            lora_request: Request naming the adapter path, id and optional
                tensorizer configuration.

        Returns:
            The model produced by ``from_local_checkpoint``.

        Raises:
            ValueError: If a ``FileNotFoundError`` is raised while resolving
                or reading the adapter.
            Exception: Any other error raised during validation or loading
                propagates unchanged.
        """
        supported_lora_modules = self._adapter_manager.supported_lora_modules
        packed_modules_mapping = self._adapter_manager.packed_modules_mapping

        # Expand packed modules into their constituent names; the literal
        # "experts" name is additionally always kept as-is.
        expected_lora_modules: set[str] = set()
        for module in supported_lora_modules:
            if module in packed_modules_mapping:
                expected_lora_modules.update(packed_modules_mapping[module])
            else:
                expected_lora_modules.add(module)
            if module == "experts":
                expected_lora_modules.add(module)

        try:
            lora_path = get_adapter_absolute_path(lora_request.lora_path)

            peft_helper = PEFTHelper.from_local_dir(
                lora_path,
                self.max_position_embeddings,
                lora_request.tensorizer_config_dict,
            )

            # Validates the LoRA configuration against requirements before
            # loading weights, throwing an exception if validation fails.
            peft_helper.validate_legal(self.lora_config)

            # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
            # to ensure correct loading of lora weights.
            model = self._adapter_manager.model
            hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)

            # Get model-defined prefixes to skip during LoRA loading.
            lora_skip_prefixes = getattr(model, "lora_skip_prefixes", None)

            lora = self._lora_model_cls.from_local_checkpoint(
                lora_path,
                expected_lora_modules,
                peft_helper=peft_helper,
                lora_model_id=lora_request.lora_int_id,
                device="cpu",
                dtype=self.lora_config.lora_dtype,
                model_vocab_size=self.vocab_size,
                tensorizer_config_dict=lora_request.tensorizer_config_dict,
                weights_mapper=hf_to_vllm_mapper,
                skip_prefixes=lora_skip_prefixes,
            )
        except FileNotFoundError as e:
            # FileNotFoundError should be raised if both
            # - No adapter found to download from huggingface (or in
            #       offline mode)
            # - No local adapter files found at `lora_request.lora_path`
            # For NotFoundError
            raise ValueError(
                f"Loading lora {lora_request.lora_name} failed: No adapter "
                f"found for {lora_request.lora_path}"
            ) from e
        # Every other exception (e.g. a failed validation, reported as
        # BadRequestError) propagates to the caller unchanged.

        return lora

    def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
        """Register a dummy LoRA under ``lora_request.lora_int_id``.

        Args:
            lora_request: Request supplying the adapter id.
            rank: Rank passed to ``create_dummy_lora``. It is not consulted
                when a cached dummy model is cloned instead.

        Returns:
            ``False`` if the id is already listed; otherwise the result of
            the manager's ``add_adapter``.
        """
        if lora_request.lora_int_id in self.list_adapters():
            return False
        if isinstance(self._cached_dummy_lora, LoRAModel):
            # Reuse the cached dummy model under the new id.
            dummy_lora = self._cached_dummy_lora.clone(lora_request.lora_int_id)
        else:
            dummy_lora = self._adapter_manager.create_dummy_lora(
                lora_request.lora_int_id, rank, self.embedding_modules
            )
            # ``None`` means caching is enabled but still empty; ``False``
            # (caching disabled) is deliberately left untouched.
            if self._cached_dummy_lora is None:
                self._cached_dummy_lora = dummy_lora
        return self._adapter_manager.add_adapter(dummy_lora)

    def pin_adapter(self, adapter_id: int) -> bool:
        """Delegate to the adapter manager's ``pin_adapter``."""
        return self._adapter_manager.pin_adapter(adapter_id)

    def set_active_adapters(self, requests: set[Any], mapping: Any | None) -> None:
        """Apply ``requests`` and, if given, install the adapter ``mapping``."""
        self._apply_adapters(requests)
        if mapping is not None:
            self._adapter_manager.set_adapter_mapping(mapping)

    def supports_tower_connector_lora(self) -> bool:
        """Return whether the manager reports both ``supports_mm`` and
        ``supports_tower_connector_lora``."""
        return (
            self._adapter_manager.supports_mm
            and self._adapter_manager.supports_tower_connector_lora
        )

    def _apply_adapters(self, adapter_requests: set[Any]) -> None:
        """Make the loaded adapters match ``adapter_requests`` exactly.

        Falsy entries in ``adapter_requests`` are ignored. Adapters that are
        loaded but not requested are removed before the missing ones are
        added.

        Raises:
            RuntimeError: If more distinct adapters are requested than the
                manager has ``adapter_slots``.
        """
        existing_adapters = self.list_adapters()
        models_map = {
            adapter_request.adapter_id: adapter_request
            for adapter_request in adapter_requests
            if adapter_request
        }
        if len(models_map) > self._adapter_manager.adapter_slots:
            raise RuntimeError(
                f"Number of requested models ({len(models_map)}) is greater "
                "than the number of GPU model slots "
                f"({self._adapter_manager.adapter_slots})."
            )
        requested_ids = set(models_map)
        for adapter_id in existing_adapters - requested_ids:
            self.remove_adapter(adapter_id)
        for adapter_id in requested_ids - existing_adapters:
            self.add_adapter(models_map[adapter_id])

    def add_adapter(self, adapter_request: Any) -> bool:
        """Load, register and activate the requested adapter.

        Returns:
            ``False`` if the adapter id is already listed; otherwise the
            result of the manager's ``add_adapter``.
        """
        if adapter_request.adapter_id in self.list_adapters():
            return False
        loaded_adapter = self._load_adapter(adapter_request)
        loaded = self._adapter_manager.add_adapter(loaded_adapter)
        self._adapter_manager.activate_adapter(loaded_adapter.id)
        return loaded

    def remove_adapter(self, adapter_id: int) -> bool:
        """Delegate to the adapter manager's ``remove_adapter``."""
        return self._adapter_manager.remove_adapter(adapter_id)

    def remove_all_adapters(self):
        """Delegate to the adapter manager's ``remove_all_adapters``."""
        self._adapter_manager.remove_all_adapters()

    def list_adapters(self) -> set[int]:
        """Return the manager's listed adapters as a new ``set``."""
        return set(self._adapter_manager.list_adapters())
211
212
213
214
215
216
217
218
219


class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
    """Worker-side LoRA manager backed by an LRU cache.

    On every request the requested LoRAs are loaded (unless they are already
    loaded); the least recently used LoRAs are unloaded once the cache is
    above capacity."""

    # Manager class handed to the module-level ``create_lora_manager`` factory.
    _manager_cls: type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager

    def create_lora_manager(
        self,
        model: torch.nn.Module,
        vllm_config: VllmConfig | None = None,
    ) -> Any:
        """Create the LRU adapter manager for ``model`` and store it.

        Returns:
            The ``model`` attribute of the newly created manager.
        """
        manager = create_lora_manager(
            model,
            lora_manager_cls=self._manager_cls,
            max_num_seqs=self.max_num_seqs,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            device=self.device,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vllm_config=vllm_config,
        )
        self._adapter_manager = manager
        return manager.model

    def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None:
        """Add every requested LoRA; nothing is removed explicitly here.

        Falsy entries in ``lora_requests`` are skipped.

        Raises:
            RuntimeError: If more distinct LoRAs are requested than the
                manager has ``lora_slots``.
        """
        requested: dict[int, LoRARequest] = {}
        for request in lora_requests:
            if request:
                requested[request.lora_int_id] = request

        requested_count = len(requested)
        if requested_count > self._adapter_manager.lora_slots:
            raise RuntimeError(
                f"Number of requested LoRAs ({requested_count}) is greater "
                "than the number of GPU LoRA slots "
                f"({self._adapter_manager.lora_slots})."
            )

        for request in requested.values():
            self.add_adapter(request)

    def add_adapter(self, lora_request: LoRARequest) -> bool:
        """Ensure the requested LoRA is cached, then activate it.

        Returns:
            For a fresh (or in-place) load, the result of the manager's
            ``add_adapter``; for an already cached adapter, whether
            ``get_adapter`` returned something other than ``None``.
        """
        # Note that this method is not thread-safe. It may be invoked multiple
        # times for the same adapter when using multiple API servers.
        # This is ok because it's currently only called from
        # the single-threaded core engine loop.
        manager = self._adapter_manager
        adapter_id = lora_request.lora_int_id

        if adapter_id in self.list_adapters() and not lora_request.load_inplace:
            # Already cached and no in-place reload requested: just touch
            # it to update its position in the caches.
            loaded = manager.get_adapter(adapter_id) is not None
        else:
            # Load the new adapter first to ensure it is actually valid,
            # before evicting any existing adapters. This may cause the
            # number of loaded lora adapters to very temporarily exceed
            # `--max-cpu-loras`.
            lora = self._load_adapter(lora_request)

            # Drop any existing copy of this adapter (the LoRA in-place
            # reload use case).
            manager.remove_adapter(lora.id)

            # Loading succeeded; evict the oldest adapter if adding this
            # one would exceed the cache capacity.
            if len(manager) + 1 > manager.capacity:
                assert isinstance(manager, LRUCacheLoRAModelManager)
                manager.remove_oldest_adapter()

            # Finally add the new adapter to the cache.
            loaded = manager.add_adapter(lora)

        manager.activate_adapter(adapter_id)
        return loaded