# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 BERT model. """


import logging

import numpy as np
import tensorflow as tf

from .configuration_bert import BertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)


TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "bert-base-uncased": "https://cdn.huggingface.co/bert-base-uncased-tf_model.h5",
    "bert-large-uncased": "https://cdn.huggingface.co/bert-large-uncased-tf_model.h5",
    "bert-base-cased": "https://cdn.huggingface.co/bert-base-cased-tf_model.h5",
    "bert-large-cased": "https://cdn.huggingface.co/bert-large-cased-tf_model.h5",
    "bert-base-multilingual-uncased": "https://cdn.huggingface.co/bert-base-multilingual-uncased-tf_model.h5",
    "bert-base-multilingual-cased": "https://cdn.huggingface.co/bert-base-multilingual-cased-tf_model.h5",
    "bert-base-chinese": "https://cdn.huggingface.co/bert-base-chinese-tf_model.h5",
    "bert-base-german-cased": "https://cdn.huggingface.co/bert-base-german-cased-tf_model.h5",
    "bert-large-uncased-whole-word-masking": "https://cdn.huggingface.co/bert-large-uncased-whole-word-masking-tf_model.h5",
    "bert-large-cased-whole-word-masking": "https://cdn.huggingface.co/bert-large-cased-whole-word-masking-tf_model.h5",
    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://cdn.huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
    "bert-large-cased-whole-word-masking-finetuned-squad": "https://cdn.huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
    "bert-base-cased-finetuned-mrpc": "https://cdn.huggingface.co/bert-base-cased-finetuned-mrpc-tf_model.h5",
    "bert-base-japanese": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese/tf_model.h5",
    "bert-base-japanese-whole-word-masking": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/tf_model.h5",
    "bert-base-japanese-char": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-char/tf_model.h5",
    "bert-base-japanese-char-whole-word-masking": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/tf_model.h5",
    "bert-base-finnish-cased-v1": "https://cdn.huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
    "bert-base-finnish-uncased-v1": "https://cdn.huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
    "bert-base-dutch-cased": "https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/tf_model.h5",
}


def gelu(x):
    """ Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    return x * cdf


def gelu_new(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


def swish(x):
    return x * tf.sigmoid(x)


ACT2FN = {
    "gelu": tf.keras.layers.Activation(gelu),
    "relu": tf.keras.activations.relu,
    "swish": tf.keras.layers.Activation(swish),
    "gelu_new": tf.keras.layers.Activation(gelu_new),
}
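# Usage sketch (illustrative, added): config.hidden_act selects an activation by
# name, e.g. ACT2FN["gelu_new"](tf.constant([[0.5]])). "relu" maps to the plain
# Keras activation function; the others are wrapped in tf.keras.layers.Activation
# so they can be called like layers.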


class TFBertEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.initializer_range = config.initializer_range

        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.hidden_size,
            embeddings_initializer=get_initializer(self.initializer_range),
            name="position_embeddings",
        )
        self.token_type_embeddings = tf.keras.layers.Embedding(
            config.type_vocab_size,
            config.hidden_size,
            embeddings_initializer=get_initializer(self.initializer_range),
            name="token_type_embeddings",
        )

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        """Build shared word embedding layer """
        with tf.name_scope("word_embeddings"):
            # Create and initialize weights. The random normal initializer was chosen
            # arbitrarily, and works well.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.vocab_size, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )
        super().build(input_shape)
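        # Added note (assumption, not a statement from the original authors):
        # creating the weight inside the "word_embeddings" name scope is
        # presumably what keeps the variable name compatible with pretrained
        # TF BERT checkpoints.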

    def call(self, inputs, mode="embedding", training=False):
        """Get token embeddings of inputs.
        Args:
            inputs: list of four tensors (input_ids, position_ids, token_type_ids, inputs_embeds); the id tensors are int64 with shape [batch_size, length]
            mode: string, a valid value is one of "embedding" or "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor."""
        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

        if input_ids is not None:
            input_shape = shape_list(input_ids)
        else:
            input_shape = shape_list(inputs_embeds)[:-1]

        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        if inputs_embeds is None:
            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings, training=training)
        return embeddings

    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]

        x = tf.reshape(inputs, [-1, self.hidden_size])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)

        return tf.reshape(logits, [batch_size, length, self.vocab_size])
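    # Descriptive note (added): _linear reuses self.word_embeddings, so the
    # masked-LM output projection is tied to the input embedding matrix (see
    # TFBertLMPredictionHead, which calls this layer with mode="linear").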


class TFBertSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )
        self.output_attentions = config.output_attentions

        self.num_attention_heads = config.num_attention_heads
        assert config.hidden_size % config.num_attention_heads == 0
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = tf.keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = tf.keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = tf.keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )

        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, training=False):
        hidden_states, attention_mask, head_mask = inputs

        batch_size = shape_list(hidden_states)[0]
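        # Shape walk-through (descriptive comment, added): hidden_states is
        # (batch_size, seq_len, all_head_size); after the dense projections and
        # transpose_for_scores, query/key/value are each
        # (batch_size, num_heads, seq_len, attention_head_size).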
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = tf.matmul(
            query_layer, key_layer, transpose_b=True
        )  # (batch size, num_heads, seq_len_q, seq_len_k)
        dk = tf.cast(shape_list(key_layer)[-1], tf.float32)  # scale attention_scores
        attention_scores = attention_scores / tf.math.sqrt(dk)
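        # Added note: together with the masking and softmax below, this block is
        # standard scaled dot-product attention, softmax(QK^T / sqrt(d_k)) V,
        # with the additive attention_mask applied to the raw scores first.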

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the TFBertModel call() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.matmul(attention_probs, value_layer)

        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(
            context_layer, (batch_size, -1, self.all_head_size)
        )  # (batch_size, seq_len_q, all_head_size)

        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
        return outputs


class TFBertSelfOutput(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        hidden_states, input_tensor = inputs

        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class TFBertAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.self_attention = TFBertSelfAttention(config, name="self")
        self.dense_output = TFBertSelfOutput(config, name="output")

    def prune_heads(self, heads):
        raise NotImplementedError

    def call(self, inputs, training=False):
        input_tensor, attention_mask, head_mask = inputs

        self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training)
        attention_output = self.dense_output([self_outputs[0], input_tensor], training=training)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class TFBertIntermediate(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def call(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class TFBertOutput(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        hidden_states, input_tensor = inputs

        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class TFBertLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFBertAttention(config, name="attention")
        self.intermediate = TFBertIntermediate(config, name="intermediate")
        self.bert_output = TFBertOutput(config, name="output")

    def call(self, inputs, training=False):
        hidden_states, attention_mask, head_mask = inputs

        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
        attention_output = attention_outputs[0]
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.bert_output([intermediate_output, attention_output], training=training)
        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
        return outputs


class TFBertEncoder(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]

    def call(self, inputs, training=False):
        hidden_states, attention_mask, head_mask = inputs

        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training)
            hidden_states = layer_outputs[0]

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # outputs, (hidden states), (attentions)


class TFBertPooler(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )

    def call(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        return pooled_output


class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")

    def call(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class TFBertLMPredictionHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.transform = TFBertPredictionHeadTransform(config, name="transform")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
        hidden_states = hidden_states + self.bias
        return hidden_states


class TFBertMLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class TFBertNSPHead(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.seq_relationship = tf.keras.layers.Dense(
            2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
        )

    def call(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


@keras_serializable
class TFBertMainLayer(tf.keras.layers.Layer):
    config_class = BertConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFBertEmbeddings(config, name="embeddings")
        self.encoder = TFBertEncoder(config, name="encoder")
        self.pooler = TFBertPooler(config, name="pooler")

    def get_input_embeddings(self):
        return self.embeddings

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        raise NotImplementedError

    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
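        # Added note on accepted call formats (these mirror BERT_INPUTS_DOCSTRING), e.g.:
        #   model(input_ids)
        #   model([input_ids, attention_mask, token_type_ids])
        #   model({"input_ids": input_ids, "attention_mask": attention_mask})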
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.

        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
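        # Illustrative example (added): a padding mask of [[1, 1, 0]] becomes
        # [[[[0.0, 0.0, -10000.0]]]], so the padded position gets a large negative
        # score and essentially zero attention weight after the softmax.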

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
            # head_mask = tf.constant([0] * self.num_hidden_layers)

        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        outputs = (sequence_output, pooled_output,) + encoder_outputs[
            1:
        ]  # add hidden_states and attentions if they are here
        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)


class TFBertPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = BertConfig
    pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "bert"


BERT_START_DOCSTRING = r"""
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. note::

        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument:

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

BERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`__
        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`__
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
"""


@add_start_docstrings(
    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
class TFBertModel(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input; you're often better off averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.


    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertModel

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertModel.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        outputs = self.bert(inputs, **kwargs)
        return outputs


@add_start_docstrings(
    """Bert Model with two heads on top as done during the pre-training:
    a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
    BERT_START_DOCSTRING,
)
class TFBertForPreTraining(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.bert = TFBertMainLayer(config, name="bert")
        self.nsp = TFBertNSPHead(config, name="nsp___cls")
        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

    def get_output_embeddings(self):
        return self.bert.embeddings

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForPreTraining

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, seq_relationship_scores = outputs[:2]

        """
        outputs = self.bert(inputs, **kwargs)

        sequence_output, pooled_output = outputs[:2]
        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
        seq_relationship_score = self.nsp(pooled_output)

        outputs = (prediction_scores, seq_relationship_score,) + outputs[
            2:
        ]  # add hidden states and attention if they are here

        return outputs  # prediction_scores, seq_relationship_score, (hidden_states), (attentions)


@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.bert = TFBertMainLayer(config, name="bert")
        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

    def get_output_embeddings(self):
        return self.bert.embeddings

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForMaskedLM

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

        """
        outputs = self.bert(inputs, **kwargs)

        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))

        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here

        return outputs  # prediction_scores, (hidden_states), (attentions)


@add_start_docstrings(
    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
)
class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.bert = TFBertMainLayer(config, name="bert")
        self.nsp = TFBertNSPHead(config, name="nsp___cls")

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForNextSentencePrediction

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        seq_relationship_scores = outputs[0]

        """
        outputs = self.bert(inputs, **kwargs)

        pooled_output = outputs[1]
        seq_relationship_score = self.nsp(pooled_output)

        outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # seq_relationship_score, (hidden_states), (attentions)


@add_start_docstrings(
    """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    BERT_START_DOCSTRING,
)
class TFBertForSequenceClassification(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.bert = TFBertMainLayer(config, name="bert")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
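        # Added note: the classifier maps the pooled [CLS] representation to
        # config.num_labels logits; per the docstring below, num_labels == 1 is
        # treated as regression.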

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForSequenceClassification

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        """
        outputs = self.bert(inputs, **kwargs)

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # logits, (hidden_states), (attentions)


@add_start_docstrings(
    """Bert Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    BERT_START_DOCSTRING,
)
class TFBertForMultipleChoice(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.bert = TFBertMainLayer(config, name="bert")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @property
    def dummy_inputs(self):
        """ Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
        """
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
            `num_choices` is the size of the second dimension of the input tensors (see `input_ids` above).

            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForMultipleChoice

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
        outputs = model(input_ids)
        classification_scores = outputs[0]
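        predicted_choice = tf.math.argmax(classification_scores, axis=-1)  # illustrative extra step: index of the highest-scoring choice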

        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

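        # Multiple-choice inputs are 3D: (batch_size, num_choices, sequence_length); each choice is
        # scored independently by the shared encoder.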
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

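        # Flatten the choice dimension so every (example, choice) pair becomes its own row of
        # shape (batch_size * num_choices, seq_length) before running it through BERT.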
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [
            flat_input_ids,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            inputs_embeds,
        ]

        outputs = self.bert(flat_inputs, training=training)

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
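        # The classifier emits one score per flattened row; regroup the scores into (batch_size, num_choices).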
        reshaped_logits = tf.reshape(logits, (-1, num_choices))

        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # reshaped_logits, (hidden_states), (attentions)


@add_start_docstrings(
    """Bert Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    BERT_START_DOCSTRING,
)
class TFBertForTokenClassification(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

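        # Token-level head: dropout plus a dense layer applied to every token's hidden state,
        # producing config.num_labels scores per token.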
        self.bert = TFBertMainLayer(config, name="bert")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForTokenClassification

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]
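        predictions = tf.math.argmax(scores, axis=-1)  # illustrative extra step: per-token label ids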

        """
        outputs = self.bert(inputs, **kwargs)

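        # outputs[0] holds the final hidden state for every input token.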
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # scores, (hidden_states), (attentions)


@add_start_docstrings(
    """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    BERT_START_DOCSTRING,
)
class TFBertForQuestionAnswering(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

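        # QA head: a dense layer producing config.num_labels outputs per token (2 in practice),
        # which call() splits into span-start and span-end logits.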
        self.bert = TFBertMainLayer(config, name="bert")
        self.qa_outputs = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForQuestionAnswering

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        encoding = tokenizer.encode_plus(question, text)
        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
        start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :])

        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1])
        assert answer == "a nice puppet"

        """
        outputs = self.bert(inputs, **kwargs)

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
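        # Split the last axis of the (batch_size, seq_length, 2) logits into start and end logits,
        # then squeeze so each has shape (batch_size, seq_length).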
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        outputs = (start_logits, end_logits,) + outputs[2:]

        return outputs  # start_logits, end_logits, (hidden_states), (attentions)