# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import torch
import torch.nn as nn
from transformers import PretrainedConfig

from vllm.config.lora import LoRAConfig
from vllm.model_executor.layers.linear import ReplicatedLinear

from .base_linear import BaseLinearLayerWithLoRA


class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
    """LoRA-enabled wrapper around a `ReplicatedLinear` base layer.

    The LoRA weight slicing hooks (`slice_lora_a` / `slice_lora_b`) return
    their input unchanged for this layer type.
    """

    def __init__(self, base_layer: ReplicatedLinear) -> None:
        """Wrap `base_layer` and mirror its output size.

        Args:
            base_layer: The `ReplicatedLinear` layer being wrapped.
        """
        super().__init__(
            base_layer,
        )
        self.output_size = self.base_layer.output_size
        # To ensure interface compatibility, set to 1 always.
        self.n_slices = 1

    def forward(
        self, input_: torch.Tensor
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
        """Forward of ReplicatedLinearWithLoRA

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            `output` alone when `base_layer.return_bias` is false. Otherwise
            the tuple `(output, output_bias)`, where `output_bias` is the
            base layer's bias if `base_layer.skip_bias_add` is set and
            `None` if it is not.
        """
        # Hand the bias to `apply` only when the base layer does not defer
        # the bias addition to the caller.
        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None

        # Matrix multiply.
        output = self.apply(input_, bias)

        # When the bias addition is skipped, the bias is returned alongside
        # the output instead of being passed to `apply`.
        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None

        if not self.base_layer.return_bias:
            return output

        return output, output_bias

    # ReplicatedLinear should always be replaced, regardless of the fully
    # sharded LoRAs setting, because it is, by definition, copied per GPU.
    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Return whether `source_layer` is exactly a `ReplicatedLinear`.

        Only `source_layer` is inspected; `lora_config`,
        `packed_modules_list` and `model_config` are accepted but not read
        by this implementation.
        """
        # Exact type match: instances of ReplicatedLinear subclasses do not
        # qualify.
        return type(source_layer) is ReplicatedLinear

    def slice_lora_a(
        self, lora_a: torch.Tensor | list[torch.Tensor | None]
    ) -> torch.Tensor | list[torch.Tensor | None]:
        """Return `lora_a` unchanged; no slicing is applied by this layer."""
        return lora_a

    def slice_lora_b(
        self, lora_b: torch.Tensor | list[torch.Tensor | None]
    ) -> torch.Tensor | list[torch.Tensor | None]:
        """Return `lora_b` unchanged; no slicing is applied by this layer."""
        return lora_b