# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import torch
import torch.nn as nn
from transformers import PretrainedConfig

from vllm.config.lora import LoRAConfig
from vllm.model_executor.custom_op import maybe_get_oot_by_class
from vllm.model_executor.layers.linear import ReplicatedLinear

from .base_linear import BaseLinearLayerWithLoRA


class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
    """LoRA wrapper for a ``ReplicatedLinear`` base layer.

    LoRA A/B weights are passed through unsliced (see ``slice_lora_a`` and
    ``slice_lora_b``), and the layer always reports a single slice.
    """

    def __init__(self, base_layer: ReplicatedLinear) -> None:
        """Wrap ``base_layer`` and mirror its output size.

        Args:
            base_layer: The layer to wrap; it is handed to the parent
                constructor and its ``output_size`` is copied onto this
                instance.
        """
        super().__init__(
            base_layer,
        )
        self.output_size = self.base_layer.output_size
        # To ensure interface compatibility, set to 1 always.
        self.n_slices = 1

    def forward(
        self, input_: torch.Tensor
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
        """Forward of ReplicatedLinearWithLoRA

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            - ``output`` alone when ``base_layer.return_bias`` is falsy.
            - Otherwise ``(output, output_bias)``, where ``output_bias`` is
              the base layer's bias if ``base_layer.skip_bias_add`` is set
              and ``None`` if it is not.
        """
        # The bias is only handed to `apply` when the base layer is not
        # deferring the bias addition to its caller.
        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None

        # Matrix multiply.
        output = self.apply(input_, bias)

        # When the addition is deferred, the bias is returned to the caller.
        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None

        if not self.base_layer.return_bias:
            return output

        return output, output_bias

    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
        """Compute the layer output for ``x`` via ``_apply_base_forward``.

        Args:
            x: Input tensor.
            bias: Accepted as part of the signature but not used by this
                implementation; only ``x`` is passed on.

        Returns:
            Whatever ``self._apply_base_forward(x)`` returns.
        """
        # ReplicatedLinear subclasses such as GateLinear override forward() to
        # dispatch custom kernels and/or adjust the output dtype. Apply LoRA on
        # top of the actual base-layer output instead of bypassing that path.
        # NOTE(review): `_apply_base_forward` is defined outside this file
        # (presumably on BaseLinearLayerWithLoRA) - confirm how it treats bias.
        return self._apply_base_forward(x)

    # ReplicatedLinear should always be replaced, regardless of the fully
    # sharded LoRAs setting, because it is, by definition, copied per GPU.
    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None = None,
    ) -> bool:
        """Report whether ``source_layer`` should be wrapped by this class.

        Only the type of ``source_layer`` is inspected; ``lora_config``,
        ``packed_modules_list`` and ``model_config`` are not consulted.

        Returns:
            True if ``source_layer`` is an instance of the class returned by
            ``maybe_get_oot_by_class(ReplicatedLinear)``.
        """
        return isinstance(source_layer, maybe_get_oot_by_class(ReplicatedLinear))

    def slice_lora_a(
        self, lora_a: torch.Tensor | list[torch.Tensor | None]
    ) -> torch.Tensor | list[torch.Tensor | None]:
        """Return ``lora_a`` unchanged; no slicing is performed here."""
        return lora_a

    def slice_lora_b(
        self, lora_b: torch.Tensor | list[torch.Tensor | None]
    ) -> torch.Tensor | list[torch.Tensor | None]:
        """Return ``lora_b`` unchanged; no slicing is performed here."""
        return lora_b