# coding=utf-8
# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 CTRL model."""


import logging

import numpy as np
import tensorflow as tf

from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
    TFPreTrainedModel,
    TFSharedEmbeddings,
    cast_bool_to_primitive,
    keras_serializable,
    shape_list,
)
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "ctrl"
    # See all CTRL models at https://huggingface.co/models?filter=ctrl
]


def angle_defn(pos, i, d_model_size):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size))
    return pos * angle_rates


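# Unlike the interleaved sin/cos layout of the original Transformer paper, the table built below concatenates the
# sines of the even channels and the cosines of the odd channels along the last axis, following the reference
# Salesforce CTRL code. The result has shape (position, d_model_size) and is later indexed with
# tf.gather(self.pos_encoding, position_ids).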
def positional_encoding(position, d_model_size):
    # create the sinusoidal pattern for the positional encoding
    angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)

    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])

    # pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1)[np.newaxis, ...], dtype=tf.float32)
    pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32)
    return pos_encoding


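# q, k and v arrive here already split into heads, i.e. with shape (batch_size, num_heads, seq_len, depth).
# `mask` is the causal mask (1.0 above the diagonal), applied as an additive -1e4 penalty on the logits;
# `attention_mask` is the padding mask, already converted upstream to 0.0 / -10000.0 additive values; `head_mask`
# multiplies the softmax probabilities to silence individual heads.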
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
    # calculate attention
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(shape_list(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += mask * -1e4

    if attention_mask is not None:
        # Apply the attention mask
        scaled_attention_logits = scaled_attention_logits + attention_mask

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    # Mask heads if we want to
    if head_mask is not None:
        attention_weights = attention_weights * head_mask

    output = tf.matmul(attention_weights, v)

    return output, attention_weights
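

# In TFMultiHeadAttention below, an optional `layer_past` cache (the stacked keys and values from previous
# decoding steps) is concatenated along the sequence axis before attention is computed; the updated pair is
# returned as `present`, a tensor of shape (2, batch_size, num_heads, seq_len, depth), for the next step.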


class TFMultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model_size, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model_size = d_model_size

        self.depth = int(d_model_size / self.num_heads)

        self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq")
        self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk")
        self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv")

        self.dense = tf.keras.layers.Dense(d_model_size, name="dense")

    def split_into_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, training=False):
        v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions = inputs
        batch_size = shape_list(q)[0]

        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        if layer_past is not None:
            past_key, past_value = tf.unstack(layer_past, axis=0)
            k = tf.concat((past_key, k), axis=-2)
            v = tf.concat((past_value, v), axis=-2)

        # to cope with keras serialization
        use_cache = cast_bool_to_primitive(use_cache, True)

        if use_cache is True:
            present = tf.stack((k, v), axis=0)
        else:
            present = (None,)

        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
        scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
        attn = output[1]
        original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
        output = self.dense(original_size_attention)

        outputs = (output, present)
        if cast_bool_to_primitive(output_attentions) is True:
            outputs = outputs + (attn,)
        return outputs


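# A position-wise two-layer MLP: Dense(dff) with ReLU activation followed by Dense(d_model_size). The inner layer
# names "0" and "2" presumably mirror the nn.Sequential indices of the PyTorch CTRL implementation so that
# checkpoint variable names line up (an assumption; this file does not state it).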
def point_wise_feed_forward_network(d_model_size, dff, name=""):
    return tf.keras.Sequential(
        [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")],
        name="ffn",
    )
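

# TFEncoderLayer below is a pre-norm transformer block: LayerNorm is applied before both the attention and the
# feed-forward sub-layer, and each sub-layer output is added back to its un-normalised input through a residual
# connection.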


class TFEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, **kwargs):
        super().__init__(**kwargs)

        self.multi_head_attention = TFMultiHeadAttention(d_model_size, num_heads, name="multi_head_attention")
        self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn")

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions = inputs
        normed = self.layernorm1(x)
        attn_outputs = self.multi_head_attention(
            [normed, normed, normed, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions],
            training=training,
        )
        attn_output = attn_outputs[0]
        attn_output = self.dropout1(attn_output, training=training)
        out1 = x + attn_output

        out2 = self.layernorm2(out1)
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = out1 + ffn_output

        outputs = (out2,) + attn_outputs[1:]
        return outputs


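# TFCTRLMainLayer holds the shared token embedding ("w"), the precomputed sinusoidal position table, the stack of
# TFEncoderLayer blocks ("h_._0" ... "h_._{n_layer - 1}") and the final LayerNormalization, and runs the whole
# forward pass in `call`; the public TFCTRLModel and TFCTRLLMHeadModel classes below are thin wrappers around it.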
@keras_serializable
class TFCTRLMainLayer(tf.keras.layers.Layer):
    config_class = CTRLConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.use_cache = config.use_cache

        self.d_model_size = config.n_embd
        self.num_layers = config.n_layer

        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)

        self.w = TFSharedEmbeddings(
            config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w"
        )

        self.dropout = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [
            TFEncoderLayer(
                config.n_embd,
                config.n_head,
                config.dff,
                config.resid_pdrop,
                config.layer_norm_epsilon,
                name="h_._{}".format(i),
            )
            for i in range(config.n_layer)
        ]
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")

    def get_input_embeddings(self):
        return self.w

    def set_input_embeddings(self, value):
        self.w.weight = value
        self.w.vocab_size = value.shape[0]

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        raise NotImplementedError

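    # `call` returns a tuple whose composition depends on the flags:
    # (last_hidden_state, presents if use_cache, all_hidden_states if output_hidden_states,
    #  all_attentions if output_attentions).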
    def call(
        self,
        inputs,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        training=False,
    ):

        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else past
            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            head_mask = inputs[5] if len(inputs) > 5 else head_mask
            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
            use_cache = inputs[7] if len(inputs) > 7 else use_cache
            output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
            output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
            assert len(inputs) <= 10, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            past = inputs.get("past", past)
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            use_cache = inputs.get("use_cache", use_cache)
            output_attentions = inputs.get("output_attentions", output_attentions)
            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
            assert len(inputs) <= 10, "Too many inputs."
        else:
            input_ids = inputs

        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        use_cache = use_cache if use_cache is not None else self.use_cache

        # If using past key value states, only the last tokens
        # should be given as an input
        if past is not None:
            if input_ids is not None:
                input_ids = input_ids[:, -1:]
            if inputs_embeds is not None:
                inputs_embeds = inputs_embeds[:, -1:]
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1:]

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            past_length = shape_list(past[0][0])[-2]
        if position_ids is None:
            position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
            position_ids = tf.tile(position_ids, [input_shape[0], 1])

        # Attention mask.
        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.

            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0
        else:
            attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_layers

        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            token_type_embeds = self.w(token_type_ids, mode="embedding")
            token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))
        else:
            token_type_embeds = 0
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        if inputs_embeds is None:
            inputs_embeds = self.w(input_ids, mode="embedding")
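        # Causal mask: tf.linalg.band_part(..., -1, 0) keeps the lower triangle, so `mask` is 1.0 strictly above
        # the diagonal (future positions) and 0.0 elsewhere; scaled_dot_product_attention adds it as `mask * -1e4`.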
        seq_len = input_shape[-1]
        mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

        inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))

        pos_embeds = tf.gather(self.pos_encoding, position_ids)

        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

        hidden_states = self.dropout(hidden_states, training=training)

        output_shape = input_shape + [shape_list(hidden_states)[-1]]
        presents = ()
        all_hidden_states = ()
        all_attentions = []
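        # Run the decoder stack. `past` supplies one cached (key, value) tensor per layer and `presents` collects
        # the updated caches so they can be fed back in on the next generation step.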
        for i, (h, layer_past) in enumerate(zip(self.h, past)):
            if cast_bool_to_primitive(output_hidden_states) is True:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
            outputs = h(
                [hidden_states, mask, layer_past, attention_mask, head_mask[i], use_cache, output_attentions],
                training=training,
            )
            hidden_states, present = outputs[:2]

            if use_cache is True:
                presents = presents + (present,)

            if cast_bool_to_primitive(output_attentions) is True:
                all_attentions.append(outputs[2])

        hidden_states = self.layernorm(hidden_states)
        hidden_states = tf.reshape(hidden_states, output_shape)
        if cast_bool_to_primitive(output_hidden_states) is True:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if use_cache is True:
            outputs = outputs + (presents,)
        if cast_bool_to_primitive(output_hidden_states) is True:
            outputs = outputs + (all_hidden_states,)
        if cast_bool_to_primitive(output_attentions) is True:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs


class TFCTRLPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = CTRLConfig
    base_model_prefix = "transformer"


CTRL_START_DOCSTRING = r"""

    .. note::
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional argument.

        This second option is useful when using the :obj:`tf.keras.Model.fit()` method, which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument:

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

CTRL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`):
            :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states).

            Indices of input sequence tokens in the vocabulary.

            If `past` is used, only input_ids that do not have their past calculated should be passed as input_ids (see `past`).

            Indices can be obtained using :class:`transformers.CTRLTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `past` output below). Can be used to speed up sequential decoding.
            The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            If set to :obj:`True`, `past` key value states are returned and
            can be used to speed up decoding (see `past`).
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attention tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
"""


@add_start_docstrings(
    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
    CTRL_START_DOCSTRING,
)
class TFCTRLModel(TFCTRLPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFCTRLMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import CTRLTokenizer, TFCTRLModel

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = TFCTRLModel.from_pretrained('ctrl')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs


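# The LM head ties its output projection to the input embedding matrix (the "linear" mode of TFSharedEmbeddings);
# the only weight it creates itself is a per-token output bias, added in build().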
class TFCTRLLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
        hidden_states = hidden_states + self.bias
        return hidden_states


@add_start_docstrings(
    """The CTRL Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    CTRL_START_DOCSTRING,
)
class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFCTRLMainLayer(config, name="transformer")

        self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")

    def get_output_embeddings(self):
        return self.lm_head.input_embeddings

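    # During generation, once `past` is available only the most recently generated token needs to be fed back in;
    # earlier positions are already covered by the cached key/value states.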
    def prepare_inputs_for_generation(self, inputs, past, **kwargs):
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            inputs = tf.expand_dims(inputs[:, -1], -1)

        return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}

    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import CTRLTokenizer, TFCTRLLMHeadModel

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = TFCTRLLMHeadModel.from_pretrained('ctrl')

        input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
        outputs = model(input_ids)
        logits = outputs[0]  # prediction scores; no loss is returned since no labels are passed

        """
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        outputs = (lm_logits,) + transformer_outputs[1:]

        return outputs  # lm_logits, presents, (all hidden_states), (attentions)