"vllm/tool_parsers/hermes_tool_parser.py" did not exist on "05a4324f8e3932c25554791ff248e3e0200eef92"
models.py 34.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
import math
import os
6
from collections.abc import Sequence
7
from typing import Callable, Optional, TypeVar, Union
8

9
import regex as re
10
11
12
13
import safetensors.torch
import torch
from torch import nn

14
from vllm.config.lora import LoRAConfig
15
from vllm.logger import init_logger
16
from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
17
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
18
from vllm.lora.peft_helper import PEFTHelper
19
from vllm.lora.punica_wrapper import get_punica_wrapper
20
21
22
23
24
25
26
27
from vllm.lora.utils import (
    from_layer,
    from_layer_logits_processor,
    get_supported_lora_modules,
    is_regex_target_modules,
    parse_fine_tuned_lora_name,
    replace_submodule,
)
28
from vllm.model_executor.layers.fused_moe import FusedMoE
29
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
30
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
31
from vllm.model_executor.models.interfaces import is_pooling_model
32
from vllm.model_executor.models.module_mapping import MultiModelKeys
33
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
34
from vllm.model_executor.utils import get_packed_modules_mapping
35
36
from vllm.utils import is_pin_memory_available
from vllm.utils.cache import LRUCache
37

38
logger = init_logger(__name__)
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
T = TypeVar("T")


class AdapterLRUCache(LRUCache[int, T]):
    """LRU cache keyed by integer adapter id that runs a callback on removal."""

    def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
        """Initialise the base cache with ``capacity`` and keep ``deactivate_fn``.

        Args:
            capacity: Passed straight to the base ``LRUCache`` constructor.
            deactivate_fn: Callable invoked with the adapter id each time an
                entry is removed (see ``_on_remove``).
        """
        super().__init__(capacity)
        self.deactivate_fn = deactivate_fn

    def _on_remove(self, key: int, value: Optional[T]):
        """Log the removal, call ``deactivate_fn(key)``, then run the base hook."""
        logger.debug("Removing adapter int id: %d", key)
        on_deactivate = self.deactivate_fn
        on_deactivate(key)
        return super()._on_remove(key, value)


54
55
56
57
58
59
60
61
62
# Last id handed out by get_lora_id(); 0 means none has been issued yet.
_GLOBAL_LORA_ID = 0


def get_lora_id():
    """Increment the module-level counter and return its new value."""
    global _GLOBAL_LORA_ID
    next_id = _GLOBAL_LORA_ID + 1
    _GLOBAL_LORA_ID = next_id
    return next_id


63
64
65
66
67
68
def is_moe_model(model: nn.Module) -> bool:
    """Checks if the model contains FusedMoE layers and warns the user.

    Args:
        model: Model whose submodules (``model.modules()``) are scanned.

    Returns:
        True if any submodule is a ``FusedMoE`` instance, otherwise False.
        A warning is emitted through ``logger.warning_once`` when True.
    """
    has_fused_moe = any(isinstance(module, FusedMoE) for module in model.modules())
    if has_fused_moe:
        logger.warning_once(
            "For MoE models, vLLM currently does not support fused MoE LoRA "
            "inference. Please ensure that the loaded LoRA model does not "
            "contain expert weights."
        )
    return has_fused_moe


75
class LoRAModel:
76
77
78
79
80
81
    """A LoRA fine-tuned model."""

    def __init__(
        self,
        lora_model_id: int,
        rank: int,
82
        loras: dict[str, LoRALayerWeights],
83
    ) -> None:
84
85
86
87
88
        """
        Args:
            lora_model_id: The integer id for the lora model.
            rank: lora rank.
            loras: module name -> weights for lora-replaced layers.
89

90
        """
91
        self.id = lora_model_id
92

93
94
95
        assert lora_model_id > 0, (
            f"a valid lora id should be greater than 0, got {self.id}"
        )
96
        self.rank = rank
97
        self.loras: dict[str, LoRALayerWeights] = loras
98

99
100
101
102
103
104
105
106
107
108
    def clone(self, lora_model_id: int) -> "LoRAModel":
        """Return a copy of the object with different ids.

        Will share the underlying tensors."""
        return self.__class__(
            lora_model_id,
            rank=self.rank,
            loras=self.loras.copy(),
        )

109
110
    @property
    def extra_vocab_size(self) -> int:
111
112
113
114
115
        return (
            max(lora.extra_vocab_size for lora in self.loras.values())
            if self.loras
            else 0
        )
116
117
118
119
120

    def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
        """Get LoRA for a given module by name"""
        return self.loras.get(module_name, None)

121
122
123
    def check_lora_name(self, lora_name: str) -> bool:
        return lora_name in self.loras

124
125
126
127
128
    # (yard1): TODO see if we can derive target_embedding_padding automatically
    @classmethod
    def from_lora_tensors(
        cls,
        lora_model_id: int,
        tensors: dict[str, torch.Tensor],
        peft_helper: PEFTHelper,
        device: str = "cuda",
        dtype: Optional[torch.dtype] = None,
        embeddings: Optional[dict[str, torch.Tensor]] = None,
        target_embedding_padding: Optional[int] = None,
        embedding_modules: Optional[dict[str, str]] = None,
        embedding_padding_modules: Optional[list[str]] = None,
        weights_mapper: Optional[WeightsMapper] = None,
    ) -> "LoRAModel":
        """Create a LoRAModel from a dictionary of tensors.

        Args:
            lora_model_id: Id given to the resulting model.
            tensors: Checkpoint tensors keyed by their checkpoint name. Each
                name is resolved with ``parse_fine_tuned_lora_name`` into a
                module name plus flags saying whether it is a bias, a
                ``lora_a`` or (otherwise) a ``lora_b`` tensor.
            peft_helper: Adapter configuration; passed to
                ``LoRALayerWeights.from_config`` and source of the rank
                (``peft_helper.r``).
            device: Device every tensor is moved to. Tensors are additionally
                pinned when ``str(device) == "cpu"`` and pinning is available.
            dtype: dtype every tensor is converted to.
            embeddings: Optional extra embedding tensors. When truthy,
                ``embedding_modules`` must be provided.
            target_embedding_padding: When not None, ``lora_b`` of modules
                matching ``embedding_padding_modules`` is zero-padded at the
                end of its second-to-last dimension by
                ``target_embedding_padding - lora_b.shape[0]``.
            embedding_modules: Maps a module-name fragment to the key of the
                matching tensor in ``embeddings``.
            embedding_padding_modules: Module-name fragments whose ``lora_b``
                is subject to padding. Must not be None whenever a ``lora_b``
                tensor is encountered.
            weights_mapper: Forwarded to ``parse_fine_tuned_lora_name``.

        Returns:
            A new instance of ``cls`` holding one ``LoRALayerWeights`` per
            module, each already passed through ``optimize()``.
        """
        pin_memory = str(device) == "cpu" and is_pin_memory_available()
        loras: dict[str, LoRALayerWeights] = {}
        for tensor_name, tensor in tensors.items():
            module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name(
                tensor_name, weights_mapper
            )
            if module_name not in loras:
                lora_embeddings_tensor = None
                if embeddings:
                    assert embedding_modules is not None
                    # First configured fragment contained in the module name.
                    embeddings_module = next(
                        (k for k in embedding_modules if k in module_name), None
                    )
                    if embeddings_module:
                        lora_embeddings_tensor = embeddings[
                            embedding_modules[embeddings_module]
                        ].to(device=device, dtype=dtype)
                        if pin_memory:
                            lora_embeddings_tensor = lora_embeddings_tensor.pin_memory()
                loras[module_name] = LoRALayerWeights.from_config(
                    module_name, peft_helper, lora_embeddings_tensor
                )

            layer = loras[module_name]
            if is_bias:
                # Convert once; the previous code converted the tensor twice
                # and threw the first result away.
                bias = tensor.to(device=device, dtype=dtype)
                if pin_memory:
                    bias = bias.pin_memory()
                layer.bias = bias
            elif is_lora_a:
                lora_a = tensor.to(device=device, dtype=dtype)
                if pin_memory:
                    lora_a = lora_a.pin_memory()
                layer.lora_a = lora_a
            else:
                lora_b = tensor.to(device=device, dtype=dtype)
                assert embedding_padding_modules is not None
                if (
                    any(name in module_name for name in embedding_padding_modules)
                    and target_embedding_padding is not None
                ):
                    assert target_embedding_padding >= lora_b.shape[0]
                    addition = target_embedding_padding - lora_b.shape[0]
                    lora_b = torch.nn.functional.pad(lora_b, (0, 0, 0, addition))
                if pin_memory:
                    lora_b = lora_b.pin_memory()
                layer.lora_b = lora_b

        for lora in loras.values():
            lora.optimize()

        return cls(lora_model_id, peft_helper.r, loras)
193
194
195

    @classmethod
    def from_local_checkpoint(
        cls,
        lora_dir: str,
        expected_lora_modules: list[str],
        peft_helper: PEFTHelper,
        *,
        lora_model_id: Optional[int] = None,
        device: str = "cuda",
        dtype: Optional[torch.dtype] = None,
        target_embedding_padding: Optional[int] = None,
        embedding_modules: Optional[dict[str, str]] = None,
        embedding_padding_modules: Optional[list[str]] = None,
        weights_mapper: Optional[WeightsMapper] = None,
        tensorizer_config_dict: Optional[dict] = None,
    ) -> "LoRAModel":
        """Create a LoRAModel from a local checkpoint.

        Weight sources are tried in this order: tensorizer (when
        ``tensorizer_config_dict`` is truthy), ``adapter_model.safetensors``,
        then ``adapter_model.bin`` / ``adapter_model.pt``. Extra embeddings
        are read from ``new_embeddings.safetensors`` or ``new_embeddings.bin``
        when either file exists.

        Args:
            lora_dir: The local path that has lora data.
            expected_lora_modules: Name of modules that are expected to be
                replaced by lora.
            peft_helper: Loaded lora configuration information.
            lora_model_id: LoRA model id. If not given, automatically set by
                a global counter.
            device: Device where the lora model is loaded.
            dtype: dtype of the lora model weights.
            target_embedding_padding: Forwarded to ``from_lora_tensors``.
            embedding_modules: Forwarded to ``from_lora_tensors``.
            embedding_padding_modules: Forwarded to ``from_lora_tensors``.
            weights_mapper: Used when parsing tensor names here and forwarded
                to ``from_lora_tensors``.
            tensorizer_config_dict: Keyword arguments for ``TensorizerConfig``.
                When truthy, tensors are deserialized from
                ``adapter_model.tensors`` inside its ``tensorizer_dir``.

        Returns:
            Loaded LoRA Model.

        Raises:
            ValueError: If the checkpoint targets modules outside
                ``expected_lora_modules``, or if no weight file is found.
        """
        lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors")
        lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
        lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt")
        new_embeddings_tensor_path = os.path.join(
            lora_dir, "new_embeddings.safetensors"
        )
        new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin")
        tensors: dict[str, torch.Tensor] = {}
        unexpected_modules: list[Union[list[str], str]] = []

        def check_unexpected_modules(modules: dict):
            # Validates using the tensor names actually present; only the last
            # dotted component of each module name is compared.
            for lora_module in modules.keys():  # noqa
                module_name, _, _ = parse_fine_tuned_lora_name(
                    lora_module, weights_mapper
                )
                part_name = module_name.split(".")[-1]
                if part_name not in expected_lora_modules:
                    unexpected_modules.append(module_name)
            if unexpected_modules:
                raise ValueError(
                    f"While loading {lora_dir}, expected"
                    f" target modules in {expected_lora_modules}"
                    f" but received {unexpected_modules}."
                    f" Please verify that the loaded LoRA module is correct"
                )

        if tensorizer_config_dict:
            from tensorizer import TensorDeserializer

            tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
            lora_tensor_path = os.path.join(
                tensorizer_config.tensorizer_dir, "adapter_model.tensors"
            )
            tensorizer_args = tensorizer_config._construct_tensorizer_args()
            tensors = TensorDeserializer(
                lora_tensor_path,
                dtype=tensorizer_config.dtype,
                **tensorizer_args.deserialization_kwargs,
            )
            check_unexpected_modules(tensors)

        elif os.path.isfile(lora_tensor_path):
            # Find unexpected modules.
            # Use safetensor key as a source of truth to find expected modules.
            # in peft if you have target_modules A, B, C and C does not exist
            # in the model it won't error and model will be trained with A, B
            # loraified. C won't exist in the safetensor but it will exist in
            # the target_modules of the adapter_config.json.
            unexpected_modules = []
            with safetensors.safe_open(lora_tensor_path, framework="pt") as f:  # type: ignore
                # Load tensors if there are only expected modules.
                check_unexpected_modules(f)
                for module in f.keys():  # noqa
                    tensors[module] = f.get_tensor(module)
        elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
            # When a bin/pt file is provided, we rely on config to find
            # unexpected modules.
            unexpected_modules = []
            target_modules = peft_helper.target_modules
            if not isinstance(target_modules, list):
                target_modules = [target_modules]
            for module in target_modules:
                # Compatible with more modules,
                # such as:layers.11.self_attn.k_proj
                part_name = module.split(".")[-1]
                if part_name not in expected_lora_modules:
                    unexpected_modules.append(module)
            # loaded lora's target modules must be a subset of
            # expected_lora_modules. It is not reliable. See
            # https://github.com/vllm-project/vllm/pull/5909. But there's no
            # other better mechanism.
            if unexpected_modules and not is_regex_target_modules(
                peft_helper.target_modules, expected_lora_modules
            ):
                raise ValueError(
                    f"While loading {lora_dir}, expected"
                    f" target modules in {expected_lora_modules}"
                    f" but received {unexpected_modules}."
                    f" Please verify that the loaded LoRA module is correct"
                )
            # Prefer the .bin file when both .bin and .pt exist.
            lora_file_path = (
                lora_bin_file_path
                if os.path.isfile(lora_bin_file_path)
                else lora_pt_file_path
            )
            tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
        else:
            raise ValueError(f"{lora_dir} doesn't contain tensors")

        embeddings = None
        if os.path.isfile(new_embeddings_tensor_path):
            embeddings = safetensors.torch.load_file(new_embeddings_tensor_path)
        elif os.path.isfile(new_embeddings_bin_file_path):
            embeddings = torch.load(
                new_embeddings_bin_file_path, map_location=device, weights_only=True
            )

        return cls.from_lora_tensors(
            lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id,
            tensors=tensors,
            peft_helper=peft_helper,
            device=device,
            dtype=dtype,
            embeddings=embeddings,
            target_embedding_padding=target_embedding_padding,
            embedding_modules=embedding_modules,
            embedding_padding_modules=embedding_padding_modules,
            weights_mapper=weights_mapper,
        )
334
335


336
class LoRAModelManager:
337
338
339
340
    """A manager that manages multiple LoRA-fine-tuned models."""

    def __init__(
        self,
        model: SupportsLoRA,
        max_num_seqs: int,
        max_num_batched_tokens: int,
        vocab_size: int,
        lora_config: LoRAConfig,
        device: torch.device,
    ):
        """Create a LoRAModelManager and adapter for a given model.

        Besides storing its arguments, construction replaces the model's
        supported submodules with LoRA-capable layers (via
        ``_create_lora_modules``) and sets ``model.lora_manager`` to this
        manager.

        Args:
            model: the model to be adapted.
            max_num_seqs: the maximum number of sequences model can run in a
                single batch.
            max_num_batched_tokens: the maximum number of tokens model can run
                in a single batch.
            vocab_size: the vocab size of the model.
            lora_config: the LoRA configuration.
            device: device handed to the punica wrapper.

        Raises:
            AssertionError: If ``lora_config.max_cpu_loras`` is smaller than
                ``lora_config.max_loras``, or if the model exposes no
                supported LoRA modules.
        """
        self.model: SupportsLoRA = model
        self._registered_adapters: dict[int, LoRAModel] = {}
        # Dict instead of a set for compatibility with LRUCache.
        self._active_adapters: dict[int, None] = {}
        self.adapter_type = "LoRA"
        self.lora_config = lora_config
        self.device = device
        self.max_num_seqs = max_num_seqs
        assert self.capacity >= self.lora_slots
        # Stored value is rounded up to a multiple of 8; note the punica
        # wrapper below still receives the caller's unrounded value.
        self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
        self.lora_index_to_id: list[Optional[int]] = [None] * self.lora_slots
        self.vocab_size = vocab_size
        self.punica_wrapper = get_punica_wrapper(
            max_num_batched_tokens,
            max_batches=self.max_num_seqs,
            device=self.device,
            max_loras=self.lora_config.max_loras,
        )

        self.supported_lora_modules = get_supported_lora_modules(self.model)
        # The message must be a single parenthesised expression; previously
        # its second half sat on its own line and was never part of the assert.
        assert self.supported_lora_modules, (
            "No supported LoRA modules found in"
            f" {self.model.__class__.__name__}."
        )

        self.packed_modules_mapping = get_packed_modules_mapping(self.model)
        # Used to indicate whether the model is a multimodal model
        self.supports_mm: bool = (
            supports_multimodal(self.model)
            # In case the model only supports LoRA for
            # text modules (e.g. ChatGLM)
            and hasattr(self.model, "get_mm_mapping")
        )
        self.is_pooling_model = is_pooling_model(self.model)
        self.is_moe_model = is_moe_model(self.model)
        self.packed_modules: dict[str, list[str]] = {}
        self.modules: dict[str, BaseLayerWithLoRA] = {}
        self._last_mapping: Optional[LoRAMapping] = None
        self._create_lora_modules()
        self.model.lora_manager = self
398
399
400

    def __len__(self) -> int:
        return len(self._registered_adapters)
401
402
403
404
405
406
407
408
409

    @property
    def capacity(self) -> int:
        return self.lora_config.max_cpu_loras

    @property
    def lora_slots(self) -> int:
        return self.lora_config.max_loras

410
411
412
    @property
    def adapter_slots(self) -> int:
        return self.lora_slots
413

414
    def activate_adapter(
        self,
        lora_id: int,
    ) -> bool:
        """Move LoRA into a GPU buffer to be used in the forward pass.

        The adapter is assigned the first free entry of
        ``lora_index_to_id``; every managed module then either receives the
        adapter's weights for that slot or has the slot reset.

        Args:
            lora_id: Id of a registered adapter.

        Returns:
            False if the adapter was already active, True once activated.

        Raises:
            ValueError: If no slot is free, or if the adapter carries a bias
                while ``lora_config.bias_enabled`` is false.
            KeyError: If ``lora_id`` is not registered.
        """
        if lora_id in self._active_adapters:
            return False
        index = next(
            (
                i
                for i, slot_id in enumerate(self.lora_index_to_id)
                if slot_id is None
            ),
            None,
        )
        if index is None:
            raise ValueError("No free lora slots")
        # Resolve the adapter before touching any bookkeeping so that an
        # unregistered id fails without leaving a phantom "active" entry.
        lora_model = self._registered_adapters[lora_id]
        self._active_adapters[lora_id] = None
        logger.debug(
            "Activating LoRA. int id: %d, slot index: %d", lora_model.id, index
        )
        self.lora_index_to_id[index] = lora_model.id
        for module_name, module in self.modules.items():
            module_lora = self._get_lora_layer_weights(lora_model, module_name)
            if module_lora:
                module_lora.optimize()
                # Bias is not explicitly enabled with the flag enable_lora_bias.
                bias = module_lora.bias
                if (
                    torch.is_tensor(bias)
                    or (isinstance(bias, Sequence) and any(b is not None for b in bias))
                ) and not self.lora_config.bias_enabled:
                    module_lora.bias = None
                    raise ValueError(
                        f"Adapter bias cannot be used for {module_name}"
                        " without --enable-lora-bias."
                    )
                module.set_lora(
                    index,
                    module_lora.lora_a,
                    module_lora.lora_b,
                    module_lora.embeddings_tensor,
                    module_lora.bias,
                )
            else:
                module.reset_lora(index)
        return True

464
    def _deactivate_adapter(self, lora_id: int):
465
466
467
468
469
470
        try:
            index = self.lora_index_to_id.index(lora_id)
            self.lora_index_to_id[index] = None
        except ValueError:
            pass

471
    def _add_adapter(self, lora: LoRAModel):
472
        self._create_merged_loras_inplace(lora)
473
        self._registered_adapters[lora.id] = lora
474

475
    def pin_adapter(self, lora_id: int) -> bool:
476
477
        """Pin a LoRAModel in the manager cache."""
        raise NotImplementedError(
478
            "Pinning is not supported in LoRAModelManager. "
479
480
            "Use LRUCacheLoRAModelManager for pinning"
        )  # type: ignore
481

482
    def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
        """Pass ``mapping`` and the current slot table to the punica wrapper."""
        # update lora states
        self.punica_wrapper.update_metadata(
            mapping,
            self.lora_index_to_id,
            self.lora_slots + 1,
            self.vocab_size,
            self.lora_config.lora_extra_vocab_size,
        )
491

492
    def remove_all_adapters(self):
493
        """Remove all LoRAModels from the manager."""
494
        self._registered_adapters.clear()
495
        self.lora_index_to_id = [None] * self.lora_slots
496
        self._active_adapters.clear()
497
498

    def _create_lora_modules(self):
        """Swap supported submodules of the model for LoRA-capable layers.

        For each named module that matches ``supported_lora_modules`` (and is
        not skipped as a ``PPMissingLayer`` or an unsupported multimodal
        module), the module is replaced in the model with the result of
        ``from_layer``, registered with this manager, recorded as a packed
        module when applicable, and bound to the shared punica wrapper. When
        the module name contains ``lm_head``, the sibling ``logits_processor``
        is replaced as well and that replacement is what gets registered.
        """

        def _parent_module(module_name: str) -> str:
            # module name is a dot separated name.
            # for example:
            #  - given an input 'x.y.z' return 'x.y'
            #  - given an input 'x' return ''
            return module_name.rpartition(".")[0]

        for module_name, module in self.model.named_modules(remove_duplicate=False):
            if isinstance(module, PPMissingLayer):
                continue
            if not self._match_target_modules(module_name):
                continue
            # A temporary approach for multimodal models to support LoRA
            # TODO: Remove this restriction
            if self._filter_unsupported_mm_module(module_name):
                logger.warning(
                    "Regarding multimodal models, vLLM currently only supports "
                    "adding LoRA to language model, %s will be ignored.",
                    module_name,
                )
                continue
            # Last dotted component, e.g. 'qkv_proj' for 'a.b.qkv_proj'.
            leaf_name = module_name.split(".")[-1]
            packed_moduled_lst = self.packed_modules_mapping.get(leaf_name, [])
            new_module = replace_submodule(
                self.model,
                module_name,
                from_layer(
                    module,
                    self.lora_slots,
                    self.lora_config,
                    packed_moduled_lst,
                    self.model.config,
                ),
            )

            # (yard1): TODO make this more robust
            if "lm_head" in module_name:
                logits_processor_module_name = "logits_processor"
                parent_module = _parent_module(module_name)
                if parent_module:
                    logits_processor_module_name = (
                        f"{parent_module}.{logits_processor_module_name}"
                    )

                logits_processor_module = self.model.get_submodule(
                    logits_processor_module_name
                )

                new_module = replace_submodule(
                    self.model,
                    logits_processor_module_name,
                    from_layer_logits_processor(
                        logits_processor_module,
                        module,
                        self.lora_slots,
                        self.lora_config,
                        self.model.config,
                    ),
                )

            # In some models, especially multimodal ones, layers with the same
            # name may have different types, such as nn.Linear and
            # ReplicatedLinear. The nn.Linear layers cannot be replaced with
            # LoRA layers, leading to assertion error. The following check
            # aims to prevent this error
            if self.supports_mm and not isinstance(new_module, BaseLayerWithLoRA):
                continue
            self.register_module(module_name, new_module)
            self._register_packed_modules(module_name)
            # All lora layers share the same punica_wrapper based on reference.
            new_module.set_mapping(self.punica_wrapper)
570
571
572
573
574

    def register_module(self, module_name: str, module: "BaseLayerWithLoRA"):
        """Record ``module`` under ``module_name`` in ``self.modules``.

        Asserts that ``module`` is a ``BaseLayerWithLoRA`` instance.
        """
        is_lora_layer = isinstance(module, BaseLayerWithLoRA)
        assert is_lora_layer
        self.modules[module_name] = module

Terry's avatar
Terry committed
575
    def create_dummy_lora(
        self,
        lora_id: int,
        rank: int,
        embedding_modules: Optional[dict[str, str]] = None,
    ) -> LoRAModel:
        """Create zero-initialized LoRAModel for warmup.

        Args:
            lora_id: Id given to the dummy model.
            rank: LoRA rank used for every dummy weight.
            embedding_modules: Mapping whose keys name embedding modules.
                Must not be None if the model contains any matching,
                non-packed LoRA layer.

        Returns:
            A ``LoRAModel`` with one dummy weight entry (created on "cpu") per
            matching ``BaseLayerWithLoRA`` module; packed modules get a
            ``PackedLoRALayerWeights`` built from one dummy per sub-module.
        """
        model = LoRAModel(lora_id, rank, {})
        # Loop-invariant, so read once.
        bias_enabled = self.lora_config.bias_enabled
        for module_name, module in self.model.named_modules():
            if (
                not self._match_target_modules(module_name)
                or not isinstance(module, BaseLayerWithLoRA)
                or self._filter_unsupported_mm_module(module_name)
            ):
                continue
            parts = module_name.split(".")
            if module_name not in self.packed_modules:
                assert embedding_modules is not None
                if parts[-1] in embedding_modules:
                    base_layer = module.base_layer
                    input_dim = (
                        (
                            base_layer.org_vocab_size
                            + self.lora_config.lora_extra_vocab_size
                        )
                        if hasattr(base_layer, "org_vocab_size")
                        else base_layer.weight.shape[1]
                    )
                    output_dim = (
                        base_layer.embedding_dim
                        if hasattr(base_layer, "embedding_dim")
                        else base_layer.weight.shape[0]
                    )
                    embeddings_tensor_dim = (
                        base_layer.embedding_dim
                        if hasattr(base_layer, "embedding_dim")
                        else base_layer.weight.shape[1]
                    )
                    lora = LoRALayerWeights.create_dummy_lora_weights(
                        module_name,
                        input_dim,
                        output_dim,
                        rank,
                        module.lora_a_stacked[0].dtype,
                        "cpu",
                        embeddings_tensor_dim=embeddings_tensor_dim,
                        bias_enabled=bias_enabled,
                    )
                else:
                    lora = LoRALayerWeights.create_dummy_lora_weights(
                        module_name,
                        module.lora_a_stacked[0].shape[-1],
                        module.lora_b_stacked[0].shape[-2],
                        rank,
                        module.lora_a_stacked[0].dtype,
                        "cpu",
                        bias_enabled=bias_enabled,
                    )
            else:
                replacements = self.packed_modules_mapping[parts[-1]]
                subloras: list[Optional[LoRALayerWeights]] = []
                for i, r in enumerate(replacements):
                    lora = LoRALayerWeights.create_dummy_lora_weights(
                        module_name + "." + r,
                        module.lora_a_stacked[i].shape[-1],
                        module.lora_b_stacked[i].shape[-2],
                        rank,
                        module.lora_a_stacked[i].dtype,
                        "cpu",
                        bias_enabled=bias_enabled,
                    )
                    subloras.append(lora)
                lora = PackedLoRALayerWeights.pack(subloras)
            model.loras[module_name] = lora
        return model

    def _match_target_modules(self, module_name: str):
        return any(
            re.match(
653
654
655
656
657
                r".*\.{target_module}$".format(target_module=target_module), module_name
            )
            or target_module == module_name
            for target_module in self.supported_lora_modules
        )
658

659
660
661
    def _filter_unsupported_mm_module(self, module_name: str) -> bool:
        """
        Regarding multimodal models, vLLM currently only supports adding LoRA to
662
        language model. LoRA for other modules, such as the vision tower, will
663
664
665
666
        be filtered out.
        """
        if self.supports_mm:
            module_mapping: MultiModelKeys = self.model.get_mm_mapping()
667
            prefix_lst = module_mapping.connector + module_mapping.tower_model
668
            return any([module_name.startswith(prefix) for prefix in prefix_lst])
669
670
        return False

671
672
673
    def _register_packed_modules(self, module_full_name: str) -> None:
        parts = module_full_name.split(".")
        module_name = parts[-1]
674
675
676
677
        replacements = self.packed_modules_mapping.get(module_name, [])
        # When replacements is less than or equal to 1, it indicates that this
        # module is not a packed module.
        if len(replacements) <= 1:
678
679
680
681
682
683
684
685
            return
        prefix = ".".join(parts[:-1])
        self.packed_modules[module_full_name] = [
            prefix + "." + r if prefix else r for r in replacements
        ]

    def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
        """Combine per-sub-module weights into packed entries, in place.

        For every packed module known to this manager, the weights of its
        sub-modules are looked up in ``lora_model``. If at least one exists,
        they are packed (missing ones as None) with
        ``PackedLoRALayerWeights.pack`` and stored under the packed module's
        name; the sub-module entries that were found are then removed from
        ``lora_model.loras``.
        """
        for module_name, new_module_names in self.packed_modules.items():
            replacement_loras: list[Optional[LoRALayerWeights]] = []
            replaced_module: set[str] = set()
            for r in new_module_names:
                lora = self._get_lora_layer_weights(lora_model, r)
                if lora:
                    replacement_loras.append(lora)
                    replaced_module.add(r)
                else:
                    # Missing (or falsy) entries are packed as None.
                    replacement_loras.append(None)
            if not replaced_module:
                continue
            # HACK Temporary solution for the pool model.
            if self.is_pooling_model and not lora_model.check_lora_name(module_name):
                replaced_module_name = module_name.replace("model.", "")
                # NOTE(review): this inner check repeats the lookup that the
                # enclosing condition just found to be False, so the rename
                # below looks unreachable. Behaviour is kept as-is; confirm
                # whether `replaced_module_name` was meant to be checked.
                if lora_model.check_lora_name(module_name):
                    module_name = replaced_module_name
            lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
                replacement_loras
            )
            # Remove the modules that have been replaced.
            for module in replaced_module:
                lora_model.loras.pop(module, None)
712

713
    def _get_lora_layer_weights(
        self, lora_model: LoRAModel, module_name: str
    ) -> Optional[LoRALayerWeights]:
        """Fetch the weights for ``module_name`` from ``lora_model``.

        For pooling models, when the name is not present the lookup is
        retried with every ``"model."`` occurrence removed from the name.

        Returns:
            The matching weights, or None if neither name is present.
        """
        org_module_name = module_name
        if self.is_pooling_model and not lora_model.check_lora_name(module_name):
            # If it's a pool model, and the layer name is not found,
            # remove the prefix 'model.' and search again.
            module_name = module_name.replace("model.", "")
            if lora_model.check_lora_name(module_name):
                org_module_name = module_name
                logger.info_once(
                    "For the pool model, successfully loaded the LoRA weights "
                    "after removing the prefix 'model.'."
                )
        return lora_model.get_lora(org_module_name)

729
    def deactivate_adapter(self, adapter_id: int) -> bool:
730
731
732
733
734
        if adapter_id not in self._active_adapters:
            return False
        self._deactivate_adapter(adapter_id)
        self._active_adapters.pop(adapter_id, None)
        return True
735
736

    def add_adapter(self, adapter: LoRAModel) -> bool:
        """Register ``adapter`` unless its id is already registered.

        Returns:
            False if an adapter with the same id is already registered, True
            after a successful registration.

        Raises:
            RuntimeError: If the number of registered adapters has reached
                ``self.capacity``.
        """
        logger.debug("Adding lora. Model id: %d, int id: %d", adapter.id, adapter.id)
        if adapter.id in self._registered_adapters:
            return False
        if len(self._registered_adapters) >= self.capacity:
            raise RuntimeError("No free adapter slots.")
        self._add_adapter(adapter)
        return True
744

745
    def set_adapter_mapping(self, mapping: LoRAMapping) -> None:
        """Apply ``mapping`` only if it differs from the last one applied."""
        if self._last_mapping != mapping:
            self._set_adapter_mapping(mapping)
            self._last_mapping = mapping
749
750

    def remove_adapter(self, adapter_id: int) -> bool:
751
752
753
754
755
        self.deactivate_adapter(adapter_id)
        if adapter_id not in self._registered_adapters:
            return False
        self._registered_adapters.pop(adapter_id, None)
        return True
756

757
758
    def list_adapters(self) -> dict[int, LoRAModel]:
        return dict(self._registered_adapters)
759

760
761
    def get_adapter(self, adapter_id: int) -> Optional[LoRAModel]:
        return self._registered_adapters.get(adapter_id)
762
763
764


class LoRALRUCache(AdapterLRUCache[LoRAModel]):
    """AdapterLRUCache specialised to hold LoRAModel values."""

    def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], bool]):
        """Create the cache.

        Args:
            capacity: Capacity forwarded to AdapterLRUCache.
            deactivate_lora_fn: Callable taking an integer adapter id;
                forwarded to AdapterLRUCache as its ``deactivate_fn``.
        """
        super().__init__(capacity, deactivate_lora_fn)


class LRUCacheLoRAModelManager(LoRAModelManager):
    """A model manager that manages multiple LoRAs with LRU cache."""

    def __init__(
        self,
        model: nn.Module,
        max_num_seqs: int,
        max_num_batched_tokens: int,
        vocab_size: int,
        lora_config: LoRAConfig,
        device: torch.device,
    ):
        """Initialise the base manager, then swap in LRU-backed containers.

        All arguments are forwarded unchanged to ``LoRAModelManager``.
        """
        super().__init__(
            model, max_num_seqs, max_num_batched_tokens, vocab_size, lora_config, device
        )
        # Registered adapters: bounded by self.capacity; the cache is given
        # deactivate_adapter as its removal callback.
        self._registered_adapters: LoRALRUCache = LoRALRUCache(
            self.capacity, self.deactivate_adapter
        )
        # Active adapters: bounded by self.lora_slots; the cache is given
        # _deactivate_adapter as its removal callback.
        self._active_adapters: LoRALRUCache = LoRALRUCache(
            self.lora_slots, self._deactivate_adapter
        )

    def list_adapters(self) -> dict[int, LoRAModel]:
        """List all registered LoRAModels."""
        return dict(self._registered_adapters.cache)

    def add_adapter(self, lora: LoRAModel) -> bool:
        """Add a LoRAModel to the manager.

        Returns:
            True if the LoRA was newly added, False if it was already
            registered (in which case only its LRU position is refreshed).
        """
        logger.debug("Adding lora. Model id: %d, int id: %d", lora.id, lora.id)
        if lora.id not in self._registered_adapters:
            self._add_adapter(lora)
            was_added = True
        else:
            # We always touch to update the LRU cache order
            self._registered_adapters.touch(lora.id)
            was_added = False
        return was_added

    def activate_adapter(
        self,
        lora_id: int,
    ) -> bool:
        """Activate a LoRA, evicting the oldest active one if slots are full.

        Returns:
            The result of ``LoRAModelManager.activate_adapter`` for ``lora_id``.
        """
        # Make room first when activating a LoRA that is not already active
        # and every slot is taken.
        if (
            lora_id not in self._active_adapters
            and len(self._active_adapters) >= self.lora_slots
        ):
            self._active_adapters.remove_oldest()
        result = super().activate_adapter(lora_id)
        # We always touch to update the LRU cache order
        self._active_adapters.touch(lora_id)
        return result

    def remove_oldest_adapter(self) -> bool:
        """Remove the oldest registered LoRA.

        Returns:
            True if a LoRA was removed, False if none were registered.
        """
        if len(self._registered_adapters) > 0:
            self._registered_adapters.remove_oldest()
            return True
        return False

    def pin_adapter(self, lora_id: int) -> bool:
        """Pin a LoRAModel in the manager cache."""
        self._pin_lora_in_cpu_cache(lora_id)
        self._pin_lora_in_gpu_cache(lora_id)
        return True

    def _pin_lora_in_cpu_cache(self, lora_id: int):
        """Pin ``lora_id`` in the registered-adapter cache.

        Raises:
            ValueError: If pinning fails with a ValueError, re-raised with a
                message stating that the LoRA is not registered.
        """
        try:
            self._registered_adapters.pin(lora_id)
        except ValueError as err:
            raise ValueError(
                f"Pinning failed. LoRA {lora_id} is not registered."
            ) from err

    def _pin_lora_in_gpu_cache(self, lora_id: int):
        """Pin ``lora_id`` in the active-adapter cache, activating it if needed."""
        if lora_id not in self._active_adapters:
            # move lora to gpu if not already active
            self.activate_adapter(lora_id)

        self._active_adapters.pin(lora_id)


def create_lora_manager(
    model: nn.Module,
    max_num_seqs: int,
    max_num_batched_tokens: int,
    vocab_size: int,
    lora_config: LoRAConfig,
    device: torch.device,
    lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
    **kwargs,
) -> LoRAModelManager:
    """Create a LoRA model manager for a given model.

    Args:
        model: The model to manage; must be an instance of ``SupportsLoRA``.
        max_num_seqs: Forwarded to the manager constructor.
        max_num_batched_tokens: Forwarded to the manager constructor.
        vocab_size: Forwarded to the manager constructor.
        lora_config: Forwarded to the manager constructor.
        device: Forwarded to the manager constructor.
        lora_manager_cls: Manager class to instantiate.
        **kwargs: Extra keyword arguments forwarded to ``lora_manager_cls``.

    Returns:
        The newly constructed manager instance.

    Raises:
        ValueError: If ``model`` is not an instance of ``SupportsLoRA``.
    """
    if not isinstance(model, SupportsLoRA):
        raise ValueError(f"Model {type(model)} is not supported for LoRA.")
    lora_manager = lora_manager_cls(
        model=model,
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        vocab_size=vocab_size,
        lora_config=lora_config,
        device=device,
        **kwargs,
    )
    return lora_manager