transformer_layer.py 9.97 KB
Newer Older
yuguo960516's avatar
bloom  
yuguo960516 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow.nn as nn

from libai.layers import build_activation
from libai.layers.droppath import DropPath
from libai.layers.layer_norm import LayerNorm
from libai.layers.mlp import MLP
from libai.utils import distributed as dist
from projects.MagicPrompt.layers.attention_layer import AttnMaskType, MultiheadAttention


class TransformerLayer(nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [bsz, seq_length, hidden size] and returns an
    output of the same size.
    The input and output have the same sbp signature, (S(0), B).

    Arguments:
        hidden_size: size of hidden state.
        ffn_hidden_size: size of the feed forward neural network.
        num_attention_heads: number of attention heads.
        is_decoder: used to specify whether this is transformer encoder layer or transformer
            decoder layer. Default: ``False``.
        attention_dropout_prob: dropout probability of attention weights.
        output_dropout_prob: dropout probability of output.
        drop_path_prob: probability handed to ``DropPath``. When it is not greater than
            ``0.0`` an ``nn.Identity`` is used instead. Default: ``0.0``.
        layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
        init_method: method to initialize the input layer weights.
        output_layer_init_method: method to initialize the output layer weights.
            If None, use `init_method`.
        bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
        bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
        scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
        apply_query_key_layer_scaling: if `true`, scaling the attention score by layer index.
            Default: ``False``.
        apply_residual_post_layernorm: if ``true``, use original BERT residual
            connection ordering. Otherwise, use Megatron BERT residual connection which
            is more stable when scaling model size introduced in
            https://arxiv.org/pdf/1909.08053.pdf.
            Default: ``False``.
        attn_mask_type: attention mask type forwarded to ``MultiheadAttention``.
            Default: ``AttnMaskType.padding``.
        layer_idx: the layer index, which determines the placement.
    """

    def __init__(
        self,
        hidden_size,
        ffn_hidden_size,
        num_attention_heads,
        is_decoder=False,
        attention_dropout_prob=0.0,
        output_dropout_prob=0.0,
        drop_path_prob=0.0,
        layernorm_epsilon=1e-5,
        init_method=nn.init.xavier_normal_,
        output_layer_init_method=None,
        bias_gelu_fusion=False,
        bias_dropout_fusion=False,
        scale_mask_softmax_fusion=False,
        apply_query_key_layer_scaling=False,
        apply_residual_post_layernorm=False,
        attn_mask_type=AttnMaskType.padding,
        *,
        layer_idx=0
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.num_attention_heads = num_attention_heads
        self.attention_dropout_prob = attention_dropout_prob
        self.output_dropout_prob = output_dropout_prob
        self.layernorm_epsilon = layernorm_epsilon
        self.attn_mask_type = attn_mask_type

        self.layer_idx = layer_idx
        self.is_decoder = is_decoder

        self.bias_gelu_fusion = bias_gelu_fusion
        self.bias_dropout_fusion = bias_dropout_fusion
        self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.apply_residual_post_layernorm = apply_residual_post_layernorm

        # The output projection falls back to the input initializer when none is given.
        self.init_method = init_method
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        self.output_layer_init_method = output_layer_init_method

        self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()

        self.input_layernorm = LayerNorm(
            self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
        )

        self.self_attention = self.build_attention(is_cross_attention=False)
        self.post_attention_layernorm = LayerNorm(
            self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
        )

        # Cross attention (and its layernorm) only exists on decoder layers.
        if self.is_decoder:
            self.cross_attention = self.build_attention(is_cross_attention=True)
            self.post_cross_attention_layernorm = LayerNorm(
                self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
            )

        self.mlp = MLP(
            self.hidden_size,
            self.ffn_hidden_size,
            self.output_dropout_prob,
            self.init_method,
            output_layer_init_method=self.output_layer_init_method,
            bias_gelu_fusion=self.bias_gelu_fusion,
            bias_dropout_fusion=self.bias_dropout_fusion,
            layer_idx=self.layer_idx,
        )

        # use gelu_tanh activation
        if not bias_gelu_fusion:
            self.mlp.activation_func = build_activation("gelu_tanh")

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        use_cache=False,
    ):
        """
        Args:
            hidden_states: shape is (batch_size, seq_length, hidden_size),
                sbp signature is (S(0), B).
            attention_mask: the combination of key padding mask and causal mask of hidden states
                with shape (batch_size, 1, seq_length, seq_length) and the sbp
                signature is (S(0), B),
            encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
                and the sbp signature is (S(0), B), which will be used in cross attention.
            encoder_attention_mask: key padding mask of encoder states with shape
                (batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
            past_key_value: tuple of key and value, each shape is
                (seq_length, bsz, num_heads, head_size), For decoder layer,
                the past_key_value contains the states both from self attention
                and cross attention.
            use_cache: it will be set to `True` when the model is in the inference phase and
                used for incremental decoding.

        Returns:
            The layer output. When ``use_cache`` is ``True`` a tuple ``(output, presents)``
            is returned instead, where ``presents`` is the cache returned by self attention,
            extended with the cache returned by cross attention on decoder layers.
        """
        # Move the inputs onto this layer's placement.
        placement = dist.get_layer_placement(self.layer_idx)
        hidden_states = hidden_states.to_global(placement=placement)

        if attention_mask is not None:
            attention_mask = attention_mask.to_global(placement=placement)

        # Split the cached states: a decoder layer carries two entries for self
        # attention followed by two entries for cross attention.
        if past_key_value is not None:
            if self.is_decoder:
                assert len(past_key_value) == 4
                self_attn_past_key_value = past_key_value[:2]
                cross_attn_past_key_value = past_key_value[2:]
            else:
                self_attn_past_key_value = past_key_value
                cross_attn_past_key_value = None
        else:
            self_attn_past_key_value, cross_attn_past_key_value = None, None

        layernorm_output = self.input_layernorm(hidden_states)
        attention_output = self.self_attention(
            layernorm_output,
            attention_mask=attention_mask,
            past_key_value=self_attn_past_key_value,
            use_cache=use_cache,
        )

        # With use_cache the attention result is an (output, cache) pair. Unpack it
        # BEFORE drop_path so that drop_path only ever receives the output tensor,
        # mirroring the order used by the cross attention branch below.
        if use_cache:
            attention_output, presents = attention_output
        attention_output = self.drop_path(attention_output)

        if self.apply_residual_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        hidden_states = residual + attention_output

        layernorm_output = self.post_attention_layernorm(hidden_states)

        if self.is_decoder:
            attention_output = self.cross_attention(
                layernorm_output,
                encoder_states,
                attention_mask=encoder_attention_mask,
                past_key_value=cross_attn_past_key_value,
                use_cache=use_cache,
            )

            if use_cache:
                attention_output, decoder_presents = attention_output
                presents += decoder_presents

            attention_output = self.drop_path(attention_output)
            if self.apply_residual_post_layernorm:
                residual = layernorm_output
            else:
                residual = hidden_states

            hidden_states = residual + attention_output
            layernorm_output = self.post_cross_attention_layernorm(hidden_states)

        mlp_output = self.mlp(layernorm_output)
        mlp_output = self.drop_path(mlp_output)

        if self.apply_residual_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        output = residual + mlp_output

        if use_cache:
            output = (output, presents)
        return output

    def build_attention(self, is_cross_attention=False):
        """Create a ``MultiheadAttention`` module configured from this layer's settings.

        Args:
            is_cross_attention: forwarded to ``MultiheadAttention`` to select between
                self attention (``False``) and cross attention (``True``).
        """
        return MultiheadAttention(
            self.hidden_size,
            self.num_attention_heads,
            is_cross_attention=is_cross_attention,
            attention_dropout_prob=self.attention_dropout_prob,
            output_dropout_prob=self.output_dropout_prob,
            init_method=self.init_method,
            output_layer_init_method=self.output_layer_init_method,
            bias_dropout_fusion=self.bias_dropout_fusion,
            scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
            apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
            attn_mask_type=self.attn_mask_type,
            layer_idx=self.layer_idx,
        )