worker_manager.py 8.82 KB
Newer Older
1
from contextlib import contextmanager
2
from typing import Any, Dict, List, Literal, Optional, Set, Type, Union
3
4
5

import torch

6
7
8
9
10
from vllm.adapter_commons.utils import (add_adapter_worker,
                                        apply_adapters_worker,
                                        list_adapters_worker,
                                        set_active_adapters_worker)
from vllm.adapter_commons.worker_manager import AbstractWorkerManager
11
from vllm.config import LoRAConfig
12
from vllm.logger import init_logger
Terry's avatar
Terry committed
13
from vllm.lora.models import (LoRAModel, LoRAModelManager,
14
15
16
                              LRUCacheLoRAModelManager, create_lora_manager)
from vllm.lora.request import LoRARequest

17
logger = init_logger(__name__)
18
19


20
class WorkerLoRAManager(AbstractWorkerManager):
    """WorkerLoRAManager that manages LoRA models on the worker side.

    Every request, the requested LoRAs will be loaded (unless they are already
    loaded), and every other LoRA will be unloaded."""

    # Manager class handed to create_lora_manager(); subclasses override it.
    _manager_cls: Type[LoRAModelManager] = LoRAModelManager

    def __init__(
        self,
        max_num_seqs: int,
        max_num_batched_tokens: int,
        vocab_size: int,
        lora_config: LoRAConfig,
        device: torch.device,
        embedding_modules: Dict[str, str],
        embedding_padding_modules: List[str],
        lora_model_cls: Type[LoRAModel] = LoRAModel,
        max_position_embeddings: Optional[int] = None,
    ):
        """Store the configuration used to build and load LoRA adapters.

        Args:
            max_num_seqs: Forwarded to create_lora_manager().
            max_num_batched_tokens: Forwarded to create_lora_manager().
            vocab_size: Forwarded to create_lora_manager(); also added to
                ``lora_config.lora_extra_vocab_size`` to form the embedding
                padding target when a checkpoint is loaded.
            lora_config: LoRA configuration (dtype, max rank, extra vocab
                size are read from it when loading adapters).
            device: Passed to the base class constructor.
            embedding_modules: Forwarded to the checkpoint loader and to
                dummy LoRA creation.
            embedding_padding_modules: Forwarded to the checkpoint loader.
            lora_model_cls: Class whose ``from_local_checkpoint`` is used to
                load adapters.
            max_position_embeddings: Forwarded to the checkpoint loader.
        """
        self._lora_model_cls = lora_model_cls
        self.embedding_modules = embedding_modules
        self.embedding_padding_modules = embedding_padding_modules
        # Tri-state dummy LoRA cache:
        #   False     -> caching disabled (the default),
        #   None      -> caching enabled, nothing cached yet,
        #   LoRAModel -> cached dummy that add_dummy_lora() clones.
        self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
        self.max_num_seqs = max_num_seqs
        self.max_num_batched_tokens = max_num_batched_tokens
        self.vocab_size = vocab_size
        self.lora_config = lora_config
        self.max_position_embeddings = max_position_embeddings
        super().__init__(device)
        # Lazily initialized by create_lora_manager.
        self._adapter_manager: LoRAModelManager

    @contextmanager
    def dummy_lora_cache(self):
        """Use this context manager to reuse the dummy lora model
        to avoid creating it repeatedly.

        Caching is switched on for the duration of the ``with`` block and is
        always switched off again on exit, including when the block raises,
        so a failure cannot leave a stale dummy LoRA cached."""
        self._cached_dummy_lora = None
        try:
            yield
        finally:
            # Reset even on error; otherwise the cache would stay enabled
            # and later add_dummy_lora() calls would keep reusing it.
            self._cached_dummy_lora = False

    @property
    def is_enabled(self) -> bool:
        """Always True for this manager."""
        return True

    def create_lora_manager(
        self,
        model: torch.nn.Module,
    ) -> Any:
        """Build the adapter manager for ``model`` and remember it.

        The manager is created through the module-level create_lora_manager()
        using this worker's limits, LoRA config and ``_manager_cls``, and is
        stored in ``self._adapter_manager``.

        Returns:
            The ``model`` attribute of the newly created manager.
        """
        lora_manager = create_lora_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            lora_manager_cls=self._manager_cls,
        )
        self._adapter_manager = lora_manager
        return lora_manager.model

    def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
        """Load the LoRA checkpoint named by ``lora_request`` onto the CPU.

        Raises:
            RuntimeError: If collecting the expected module names or loading
                the checkpoint raises any exception (chained as the cause).
            ValueError: If the loaded LoRA's rank exceeds
                ``lora_config.max_lora_rank`` or its extra vocab size exceeds
                ``lora_config.lora_extra_vocab_size``.
        """
        try:
            model = self._adapter_manager.model
            supported_lora_modules = model.supported_lora_modules
            packed_modules_mapping = model.packed_modules_mapping
            # A supported module that appears in the packed mapping is
            # replaced by the entries it maps to; others are used as-is.
            expected_lora_modules: List[str] = []
            for module in supported_lora_modules:
                if module in packed_modules_mapping:
                    expected_lora_modules.extend(
                        packed_modules_mapping[module])
                else:
                    expected_lora_modules.append(module)
            lora = self._lora_model_cls.from_local_checkpoint(
                lora_request.lora_local_path,
                expected_lora_modules,
                max_position_embeddings=self.max_position_embeddings,
                lora_model_id=lora_request.lora_int_id,
                device="cpu",
                dtype=self.lora_config.lora_dtype,
                target_embedding_padding=self.vocab_size +
                self.lora_config.lora_extra_vocab_size,
                embedding_modules=self.embedding_modules,
                embedding_padding_modules=self.embedding_padding_modules,
            )
        except Exception as e:
            raise RuntimeError(
                f"Loading lora {lora_request.lora_local_path} failed") from e
        if lora.rank > self.lora_config.max_lora_rank:
            raise ValueError(
                f"LoRA rank {lora.rank} is greater than max_lora_rank "
                f"{self.lora_config.max_lora_rank}.")
        if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
            raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} "
                             f"is greater than lora_extra_vocab_size "
                             f"{self.lora_config.lora_extra_vocab_size}.")
        return lora

    def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
        """Register a dummy LoRA under ``lora_request.lora_int_id``.

        Returns:
            False if that id is already listed; otherwise whatever the
            adapter manager's ``add_adapter`` returns for the dummy.
        """
        if lora_request.lora_int_id in self.list_adapters():
            return False
        if isinstance(self._cached_dummy_lora, LoRAModel):
            # Inside dummy_lora_cache() with a dummy already built: clone it
            # under the new id instead of creating another one.
            dummy_lora = self._cached_dummy_lora.clone(
                lora_request.lora_int_id)
        else:
            # NOTE(review): the meaning of the literal 1 passed to
            # create_dummy_lora is not visible here - confirm against
            # LoRAModelManager.create_dummy_lora.
            dummy_lora = self._adapter_manager.create_dummy_lora(
                lora_request.lora_int_id, rank, 1, self.embedding_modules)
            # None means caching is enabled but empty; keep the first dummy.
            if self._cached_dummy_lora is None:
                self._cached_dummy_lora = dummy_lora
        return self._adapter_manager.add_adapter(dummy_lora)

    def pin_adapter(self, adapter_id: int) -> bool:
        """Forward to the adapter manager's ``pin_adapter``."""
        return self._adapter_manager.pin_adapter(adapter_id)

    def set_active_adapters(self, requests: Set[Any],
                            mapping: Optional[Any]) -> None:
        """Delegate to set_active_adapters_worker().

        The helper receives ``_apply_adapters`` and the adapter manager's
        ``set_adapter_mapping`` as callbacks."""
        set_active_adapters_worker(requests, mapping, self._apply_adapters,
                                   self._adapter_manager.set_adapter_mapping)

    def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
        """Delegate to apply_adapters_worker().

        The helper receives ``list_adapters``, the manager's
        ``adapter_slots``, ``remove_adapter`` and ``add_adapter``."""
        apply_adapters_worker(adapter_requests, self.list_adapters,
                              self._adapter_manager.adapter_slots,
                              self.remove_adapter, self.add_adapter)

    def add_adapter(self, adapter_request: Any) -> bool:
        """Delegate to add_adapter_worker().

        The helper receives ``list_adapters``, ``_load_adapter`` and the
        manager's ``add_adapter`` / ``activate_adapter`` as callbacks."""
        return add_adapter_worker(adapter_request, self.list_adapters,
                                  self._load_adapter,
                                  self._adapter_manager.add_adapter,
                                  self._adapter_manager.activate_adapter)

    def remove_adapter(self, adapter_id: int) -> bool:
        """Forward to the adapter manager's ``remove_adapter``."""
        return self._adapter_manager.remove_adapter(adapter_id)

    def remove_all_adapters(self):
        """Forward to the adapter manager's ``remove_all_adapters``."""
        self._adapter_manager.remove_all_adapters()

    def list_adapters(self) -> Set[int]:
        """Return the result of list_adapters_worker() applied to the
        adapter manager's ``list_adapters``."""
        return list_adapters_worker(self._adapter_manager.list_adapters)
157
158
159
160
161
162
163
164
165


class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
    """Worker-side LoRA manager that keeps adapters in an LRU cache.

    For each request the requested LoRAs are loaded when they are not
    already present, and the least recently used LoRA is dropped once
    the cache would exceed its capacity."""

    _manager_cls: Type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager

    def create_lora_manager(
        self,
        model: torch.nn.Module,
    ) -> Any:
        """Create the LRU-backed adapter manager for ``model``.

        The new manager is stored in ``self._adapter_manager``; its
        ``model`` attribute is returned."""
        manager = create_lora_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            lora_manager_cls=self._manager_cls,
        )
        self._adapter_manager = manager
        return manager.model

    def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None:
        """Add every truthy request in ``lora_requests`` via add_adapter().

        Raises:
            RuntimeError: If the number of distinct requested LoRA ids
                exceeds the adapter manager's ``lora_slots``.
        """
        # Index requests by id, skipping falsy entries.
        requested = {}
        for request in lora_requests:
            if request:
                requested[request.lora_int_id] = request

        num_requested = len(requested)
        if num_requested > self._adapter_manager.lora_slots:
            raise RuntimeError(
                f"Number of requested LoRAs ({num_requested}) is greater "
                "than the number of GPU LoRA slots "
                f"({self._adapter_manager.lora_slots}).")

        for request in requested.values():
            self.add_adapter(request)

    def add_adapter(self, lora_request: LoRARequest) -> bool:
        """Ensure the requested LoRA is present, then activate it.

        Returns:
            For a LoRA that was not yet listed, the result of the adapter
            manager's ``add_adapter``; otherwise whether ``get_adapter``
            returned something other than None.
        """
        adapter_id = lora_request.lora_int_id
        if adapter_id in self.list_adapters():
            # Already loaded: just touch it to update its position in
            # the caches.
            cached = self._adapter_manager.get_adapter(adapter_id)
            loaded = cached is not None
        else:
            # Make room before loading the new LoRA to save memory.
            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
                assert isinstance(self._adapter_manager,
                                  LRUCacheLoRAModelManager)
                self._adapter_manager.remove_oldest_adapter()
            new_lora = self._load_adapter(lora_request)
            loaded = self._adapter_manager.add_adapter(new_lora)
        self._adapter_manager.activate_adapter(adapter_id)
        return loaded