Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
@@ -40,8 +40,8 @@ RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class RetriBertPreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = RetriBertConfig
@@ -73,7 +73,8 @@ RETRIBERT_START_DOCSTRING = r"""
@add_start_docstrings(
    """Bert Based model to embed queries or document for document retrieval. """,
    RETRIBERT_START_DOCSTRING,
)
class RetriBertModel(RetriBertPreTrainedModel):
    def __init__(self, config):
@@ -91,7 +92,11 @@ class RetriBertModel(RetriBertPreTrainedModel):
        self.init_weights()

    def embed_sentences_checkpointed(
        self,
        input_ids,
        attention_mask,
        sent_encoder,
        checkpoint_batch_size=-1,
    ):
        # reproduces BERT forward pass with checkpointing
        if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
@@ -108,7 +113,11 @@ class RetriBertModel(RetriBertPreTrainedModel):
            # define function for checkpointing
            def partial_encode(*inputs):
                encoder_outputs = sent_encoder.encoder(
                    inputs[0],
                    attention_mask=inputs[1],
                    head_mask=head_mask,
                )
                sequence_output = encoder_outputs[0]
                pooled_output = sent_encoder.pooler(sequence_output)
                return pooled_output
@@ -127,13 +136,24 @@ class RetriBertModel(RetriBertPreTrainedModel):
            return torch.cat(pooled_output_list, dim=0)
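The method above slices the batch and runs the BERT encoder under gradient checkpointing. A minimal, self-contained sketch of that pattern (illustrative only; `encoder_fn` is a hypothetical stand-in for the encoder forward pass, not a function from this file):

import torch
from torch.utils.checkpoint import checkpoint

def embed_in_slices(encoder_fn, input_ids, attention_mask, checkpoint_batch_size=8):
    # Run the encoder on slices of the batch under gradient checkpointing:
    # activations are recomputed during backward instead of being stored.
    pooled = []
    for b in range(0, input_ids.shape[0], checkpoint_batch_size):
        ids = input_ids[b : b + checkpoint_batch_size]
        mask = attention_mask[b : b + checkpoint_batch_size]
        pooled.append(checkpoint(encoder_fn, ids, mask))
    return torch.cat(pooled, dim=0)  # (batch_size, hidden_size)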
    def embed_questions(
        self,
        input_ids,
        attention_mask=None,
        checkpoint_batch_size=-1,
    ):
        q_reps = self.embed_sentences_checkpointed(
            input_ids,
            attention_mask,
            self.bert_query,
            checkpoint_batch_size,
        )
        return self.project_query(q_reps)
    def embed_answers(
        self,
        input_ids,
        attention_mask=None,
        checkpoint_batch_size=-1,
    ):
        a_reps = self.embed_sentences_checkpointed(
            input_ids,
@@ -147,33 +167,33 @@ class RetriBertModel(RetriBertPreTrainedModel):
        self, input_ids_query, attention_mask_query, input_ids_doc, attention_mask_doc, checkpoint_batch_size=-1
    ):
        r"""
        Args:
            input_ids_query (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary for the queries in a batch.
                Indices can be obtained using :class:`transformers.RetriBertTokenizer`.
                See :func:`transformers.PreTrainedTokenizer.encode` and
                :func:`transformers.PreTrainedTokenizer.__call__` for details.
                `What are input IDs? <../glossary.html#input-ids>`__
            attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
                Mask to avoid performing attention on queries padding token indices.
                Mask values selected in ``[0, 1]``:
                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
                `What are attention masks? <../glossary.html#attention-mask>`__
            input_ids_doc (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary for the documents in a batch.
            attention_mask_doc (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
                Mask to avoid performing attention on documents padding token indices.
            checkpoint_batch_size (:obj:`int`, `optional`, defaults to :obj:`-1`):
                If greater than 0, uses gradient checkpointing to only compute sequence representation on checkpoint_batch_size examples at a time
                on the GPU. All query representations are still compared to all document representations in the batch.

        Return:
            :obj:`torch.FloatTensor` the bi-directional cross-entropy loss obtained while trying to match each query to its corresponding document
            and each document to its corresponding query in the batch
        """
        device = input_ids_query.device
        q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
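A hedged sketch of how a bi-directional matching loss of the kind described in the Return section is typically computed from the query and document embeddings (names are illustrative, not the exact body of this forward method):

import torch
import torch.nn.functional as F

def bidirectional_matching_loss(q_reps, a_reps):
    # Assumes the i-th query matches the i-th document in the batch.
    scores = torch.mm(q_reps, a_reps.t())            # (n_queries, n_docs) similarity matrix
    targets = torch.arange(scores.shape[0], device=scores.device)
    loss_qd = F.cross_entropy(scores, targets)       # match each query to its document
    loss_dq = F.cross_entropy(scores.t(), targets)   # match each document to its query
    return (loss_qd + loss_dq) / 2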
......
@@ -83,7 +83,7 @@ class RobertaEmbeddings(BertEmbeddings):
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
@@ -220,36 +220,36 @@ class RobertaForCausalLM(BertPreTrainedModel):
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
            is used in the cross-attention if the model is configured as a decoder.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the left-to-right language modeling loss (next word prediction).
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

        Returns:

        Example::

            >>> from transformers import RobertaTokenizer, RobertaLMHeadModel, RobertaConfig
            >>> import torch

            >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            >>> config = RobertaConfig.from_pretrained("roberta-base")
            >>> config.is_decoder = True
            >>> model = RobertaLMHeadModel.from_pretrained('roberta-base', config=config, return_dict=True)

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -283,7 +283,10 @@ class RobertaForCausalLM(BertPreTrainedModel):
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutput(
            loss=lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
@@ -493,7 +496,10 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -581,7 +587,10 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -667,7 +676,10 @@ class RobertaForTokenClassification(BertPreTrainedModel):
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -791,7 +803,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
def create_position_ids_from_input_ids(input_ids, padding_idx):
    """Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.
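A small illustration of the behaviour described above (a sketch, not necessarily the exact body of this function): positions for non-padding tokens come from a cumulative sum over the padding mask, offset past `padding_idx`.

import torch

def position_ids_from_input_ids_sketch(input_ids, padding_idx):
    # 1 for real tokens, 0 for padding.
    mask = input_ids.ne(padding_idx).int()
    # Running count of real tokens gives 1..n, shifted past padding_idx;
    # the final multiply/add forces padding positions back to padding_idx.
    return torch.cumsum(mask, dim=1).type_as(mask) * mask + padding_idx

# e.g. with padding_idx=1: [[5, 6, 1, 1]] -> [[2, 3, 1, 1]]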
......
@@ -62,8 +62,7 @@ T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
####################################################
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re
@@ -156,8 +155,8 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """Construct a layernorm module in the T5 style
        No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
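For context, a forward pass for a layer norm of this style rescales by the inverse root mean square only; a minimal sketch, assuming a learned `weight` of shape `(hidden_size,)`:

import torch

def t5_layer_norm_forward(hidden_states, weight, eps=1e-6):
    # T5-style: no mean subtraction and no bias; scale by the inverse RMS, then by the learned weight.
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    return weight * (hidden_states * torch.rsqrt(variance + eps))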
@@ -569,8 +568,8 @@ class T5Block(nn.Module):
class T5PreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = T5Config
@@ -913,9 +912,9 @@ class T5Model(T5PreTrainedModel):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
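A brief usage sketch of the head-pruning interface documented above (model name and head indices are arbitrary examples):

from transformers import T5Model

model = T5Model.from_pretrained("t5-small")
# Remove attention heads 0 and 2 in layer 0, and head 1 in layer 3.
model.prune_heads({0: [0, 2], 3: [1]})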
@@ -940,19 +939,19 @@ class T5Model(T5PreTrainedModel):
        **kwargs,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import T5Tokenizer, T5Model

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5Model.from_pretrained('t5-small')

            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            >>> outputs = model(input_ids=input_ids)

            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        if "decoder_past_key_value_states" in kwargs:
            warnings.warn(
@@ -1093,31 +1092,31 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Examples::

            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)

            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            >>> outputs = model(input_ids=input_ids, labels=input_ids)
            >>> loss = outputs.loss
            >>> logits = outputs.logits

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
            >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            >>> outputs = model.generate(input_ids)
        """
        if "lm_labels" in kwargs:
......
@@ -74,8 +74,7 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFAlbertEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
@@ -169,10 +168,10 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.

        Args:
            inputs: A float32 tensor with shape [batch_size, length, embedding_size]

        Returns:
            float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]
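To make the shape contract above concrete, here is a hedged sketch of a weight-tied projection of this kind, assuming `word_embeddings` is the `[vocab_size, embedding_size]` embedding matrix and `embedding_size` is known:

import tensorflow as tf

def linear_logits_sketch(inputs, word_embeddings, embedding_size):
    # inputs: [batch_size, length, embedding_size]; word_embeddings: [vocab_size, embedding_size]
    batch_size = tf.shape(inputs)[0]
    length = tf.shape(inputs)[1]
    x = tf.reshape(inputs, [-1, embedding_size])
    logits = tf.matmul(x, word_embeddings, transpose_b=True)  # project onto the tied vocabulary matrix
    return tf.reshape(logits, [batch_size, length, -1])       # [batch_size, length, vocab_size]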
@@ -478,8 +477,8 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
class TFAlbertPreTrainedModel(TFPreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = AlbertConfig
@@ -551,9 +550,9 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        raise NotImplementedError
@@ -655,7 +654,10 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        pooled_output = self.pooler(sequence_output[:, 0])

        if not return_dict:
            return (
                sequence_output,
                pooled_output,
            ) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
@@ -821,16 +823,16 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
    @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def call(self, inputs, **kwargs):
        r"""
        Return:

        Examples::

            import tensorflow as tf
            from transformers import AlbertTokenizer, TFAlbertForPreTraining

            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')

            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
            outputs = model(input_ids)

            prediction_scores, sop_scores = outputs[:2]
        """
        return_dict = kwargs.get("return_dict")
        return_dict = return_dict if return_dict is not None else self.albert.return_dict
@@ -856,7 +858,9 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
        self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )

    def call(self, pooled_output, training: bool):
@@ -935,7 +939,10 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -1016,7 +1023,10 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -1095,7 +1105,10 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -1211,7 +1224,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
    @property
    def dummy_inputs(self):
        """Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
@@ -1316,5 +1329,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -310,27 +310,27 @@ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
class TFAutoModel(object):
    r"""
    :class:`~transformers.TFAutoModel` is a generic model class
    that will be instantiated as one of the base model classes of the library
    when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `t5`: TFT5Model (T5 model)
        - `distilbert`: TFDistilBertModel (DistilBERT model)
        - `roberta`: TFRobertaModel (RoBERTa model)
        - `bert`: TFBertModel (Bert model)
        - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
        - `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
        - `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
        - `xlnet`: TFXLNetModel (XLNet model)
        - `xlm`: TFXLMModel (XLM model)
        - `ctrl`: TFCTRLModel (CTRL model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
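A brief usage sketch of the two class methods described above (the checkpoint name is an arbitrary example):

from transformers import AutoConfig, TFAutoModel

# from_pretrained picks the concrete class (here TFBertModel) from the checkpoint's config.
model = TFAutoModel.from_pretrained("bert-base-uncased")

# from_config builds the architecture only, without loading pretrained weights.
config = AutoConfig.from_pretrained("bert-base-uncased")
model = TFAutoModel.from_config(config)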
@@ -342,7 +342,7 @@ class TFAutoModel(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -381,7 +381,7 @@ class TFAutoModel(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the base model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -477,11 +477,11 @@ class TFAutoModel(object):
class TFAutoModelForPreTraining(object):
    r"""
    :class:`~transformers.TFAutoModelForPreTraining` is a generic model class
    that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -493,7 +493,7 @@ class TFAutoModelForPreTraining(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -532,7 +532,7 @@ class TFAutoModelForPreTraining(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        based on the `model_type` property of the config object, or when it's missing,
@@ -630,27 +630,27 @@ class TFAutoModelForPreTraining(object):
class TFAutoModelWithLMHead(object):
    r"""
    :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `t5`: TFT5ForConditionalGeneration (T5 model)
        - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
        - `roberta`: TFRobertaForMaskedLM (RoBERTa model)
        - `bert`: TFBertForMaskedLM (Bert model)
        - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
        - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
        - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
        - `xlnet`: TFXLNetLMHeadModel (XLNet model)
        - `xlm`: TFXLMWithLMHeadModel (XLM model)
        - `ctrl`: TFCTRLLMHeadModel (CTRL model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -662,7 +662,7 @@ class TFAutoModelWithLMHead(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -705,7 +705,7 @@ class TFAutoModelWithLMHead(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -808,18 +808,18 @@ class TFAutoModelWithLMHead(object):
class TFAutoModelForMultipleChoice:
    r"""
    :class:`~transformers.TFAutoModelForMultipleChoice` is a generic model class
    that will be instantiated as one of the multiple choice model classes of the library
    when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `albert`: TFAlbertForMultipleChoice (Albert model)
        - `bert`: TFBertForMultipleChoice (Bert model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -831,7 +831,7 @@ class TFAutoModelForMultipleChoice:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -864,7 +864,7 @@ class TFAutoModelForMultipleChoice:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the multiple choice model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -958,12 +958,12 @@ class TFAutoModelForMultipleChoice:
class TFAutoModelForCausalLM:
    r"""
    :class:`~transformers.TFAutoModelForCausalLM` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -975,7 +975,7 @@ class TFAutoModelForCausalLM:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1011,7 +1011,7 @@ class TFAutoModelForCausalLM:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1093,12 +1093,12 @@ class TFAutoModelForCausalLM:
class TFAutoModelForMaskedLM:
    r"""
    :class:`~transformers.TFAutoModelForMaskedLM` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1110,7 +1110,7 @@ class TFAutoModelForMaskedLM:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1149,7 +1149,7 @@ class TFAutoModelForMaskedLM:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1235,12 +1235,12 @@ class TFAutoModelForMaskedLM:
class TFAutoModelForSeq2SeqLM:
    r"""
    :class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1252,7 +1252,7 @@ class TFAutoModelForSeq2SeqLM:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1285,7 +1285,7 @@ class TFAutoModelForSeq2SeqLM:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1364,22 +1364,22 @@ class TFAutoModelForSeq2SeqLM:
class TFAutoModelForSequenceClassification(object):
    r"""
    :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
    that will be instantiated as one of the sequence classification model classes of the library
    when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
        - `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
        - `bert`: TFBertForSequenceClassification (Bert model)
        - `xlnet`: TFXLNetForSequenceClassification (XLNet model)
        - `xlm`: TFXLMForSequenceClassification (XLM model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1391,7 +1391,7 @@ class TFAutoModelForSequenceClassification(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1428,7 +1428,7 @@ class TFAutoModelForSequenceClassification(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the sequence classification model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1525,23 +1525,23 @@ class TFAutoModelForSequenceClassification(object):
class TFAutoModelForQuestionAnswering(object):
    r"""
    :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
    that will be instantiated as one of the question answering model classes of the library
    when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
        - `albert`: TFAlbertForQuestionAnswering (ALBERT model)
        - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model)
        - `bert`: TFBertForQuestionAnswering (Bert model)
        - `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
        - `xlm`: TFXLMForQuestionAnswering (XLM model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1553,7 +1553,7 @@ class TFAutoModelForQuestionAnswering(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1591,7 +1591,7 @@ class TFAutoModelForQuestionAnswering(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1697,7 +1697,7 @@ class TFAutoModelForTokenClassification:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1733,7 +1733,7 @@ class TFAutoModelForTokenClassification:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
......
@@ -89,7 +89,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
def gelu(x):
    """Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
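For reference, sketches of the exact (erf-based) GELU and of the tanh approximation quoted above, written with TensorFlow ops (illustrative, not necessarily the exact bodies used in this file):

import math
import tensorflow as tf

def gelu_sketch(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF.
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    return x * cdf

def gelu_new_sketch(x):
    # Tanh approximation mentioned in the docstring above (OpenAI GPT variant).
    cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))
    return x * cdf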
@@ -127,8 +127,7 @@ ACT2FN = {
class TFBertEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
@@ -225,10 +224,10 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.

        Args:
            inputs: A float32 tensor with shape [batch_size, length, hidden_size]

        Returns:
            float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]
@@ -551,9 +550,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
        self.embeddings.vocab_size = value.shape[0]

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        raise NotImplementedError
...@@ -656,7 +655,10 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -656,7 +655,10 @@ class TFBertMainLayer(tf.keras.layers.Layer):
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
if not return_dict: if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:] return (
sequence_output,
pooled_output,
) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling( return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
...@@ -667,8 +669,8 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -667,8 +669,8 @@ class TFBertMainLayer(tf.keras.layers.Layer):
class TFBertPreTrainedModel(TFPreTrainedModel): class TFBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = BertConfig config_class = BertConfig
...@@ -824,18 +826,18 @@ class TFBertForPreTraining(TFBertPreTrainedModel): ...@@ -824,18 +826,18 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
@replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import BertTokenizer, TFBertForPreTraining from transformers import BertTokenizer, TFBertForPreTraining
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForPreTraining.from_pretrained('bert-base-uncased') model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2] prediction_scores, seq_relationship_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
...@@ -933,7 +935,10 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ...@@ -933,7 +935,10 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1011,12 +1016,16 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -1011,12 +1016,16 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFCausalLMOutput( return TFCausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, """Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
) )
class TFBertForNextSentencePrediction(TFBertPreTrainedModel): class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
...@@ -1029,22 +1038,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): ...@@ -1029,22 +1038,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import BertTokenizer, TFBertForNextSentencePrediction from transformers import BertTokenizer, TFBertForNextSentencePrediction
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light." next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors='tf') encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random assert logits[0][0] < logits[0][1] # the next sentence was random
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.bert.return_dict return_dict = return_dict if return_dict is not None else self.bert.return_dict
...@@ -1057,7 +1066,9 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): ...@@ -1057,7 +1066,9 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
return (seq_relationship_score,) + outputs[2:] return (seq_relationship_score,) + outputs[2:]
return TFNextSentencePredictorOutput( return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1138,7 +1149,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -1138,7 +1149,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1159,7 +1173,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1159,7 +1173,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -1261,7 +1275,10 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1261,7 +1275,10 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1340,7 +1357,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1340,7 +1357,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
......
...@@ -77,7 +77,8 @@ class TFCamembertModel(TFRobertaModel): ...@@ -77,7 +77,8 @@ class TFCamembertModel(TFRobertaModel):
@add_start_docstrings( @add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, """CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING,
) )
class TFCamembertForMaskedLM(TFRobertaForMaskedLM): class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
""" """
......
...@@ -245,8 +245,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -245,8 +245,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
raise NotImplementedError raise NotImplementedError
...@@ -426,8 +426,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -426,8 +426,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
class TFCTRLPreTrainedModel(TFPreTrainedModel): class TFCTRLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = CTRLConfig config_class = CTRLConfig
......
...@@ -70,7 +70,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -70,7 +70,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
def gelu(x): def gelu(x):
""" Gaussian Error Linear Unit. """Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created. Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
...@@ -177,10 +177,10 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -177,10 +177,10 @@ class TFEmbeddings(tf.keras.layers.Layer):
def _linear(self, inputs): def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer. """Computes logits by running inputs through a linear layer.
Args: Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = shape_list(inputs)[0] batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1] length = shape_list(inputs)[1]
...@@ -518,8 +518,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -518,8 +518,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class TFDistilBertPreTrainedModel(TFPreTrainedModel): class TFDistilBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = DistilBertConfig config_class = DistilBertConfig
...@@ -634,7 +634,8 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -634,7 +634,8 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
@add_start_docstrings( @add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, """DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
...@@ -875,7 +876,10 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -875,7 +876,10 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -902,7 +906,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -902,7 +906,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
......
...@@ -54,8 +54,7 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -54,8 +54,7 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFElectraEmbeddings(tf.keras.layers.Layer): class TFElectraEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings. """Construct the embeddings from word, position and token_type embeddings."""
"""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -94,7 +93,13 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -94,7 +93,13 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
super().build(input_shape) super().build(input_shape)
def call( def call(
self, input_ids, position_ids=None, token_type_ids=None, inputs_embeds=None, mode="embedding", training=False, self,
input_ids,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
): ):
"""Get token embeddings of inputs. """Get token embeddings of inputs.
Args: Args:
...@@ -144,10 +149,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -144,10 +149,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
def _linear(self, inputs): def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer. """Computes logits by running inputs through a linear layer.
Args: Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = shape_list(inputs)[0] batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1] length = shape_list(inputs)[1]
...@@ -250,9 +255,9 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): ...@@ -250,9 +255,9 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
raise NotImplementedError raise NotImplementedError
...@@ -491,18 +496,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): ...@@ -491,18 +496,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForPreTraining from transformers import ElectraTokenizer, TFElectraForPreTraining
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
scores = outputs[0] scores = outputs[0]
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
...@@ -729,7 +734,10 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla ...@@ -729,7 +734,10 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -752,7 +760,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) ...@@ -752,7 +760,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -853,7 +861,10 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) ...@@ -853,7 +861,10 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1020,7 +1031,10 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -1020,7 +1031,10 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
loss = self.compute_loss(labels, (start_logits, end_logits)) loss = self.compute_loss(labels, (start_logits, end_logits))
if not return_dict: if not return_dict:
output = (start_logits, end_logits,) + discriminator_hidden_states[1:] output = (
start_logits,
end_logits,
) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput( return TFQuestionAnsweringModelOutput(
......
...@@ -252,8 +252,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -252,8 +252,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self.wte.vocab_size = self.wte.weight.shape[0] self.wte.vocab_size = self.wte.weight.shape[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
raise NotImplementedError raise NotImplementedError
...@@ -417,8 +417,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -417,8 +417,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
class TFGPT2PreTrainedModel(TFPreTrainedModel): class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = GPT2Config config_class = GPT2Config
...@@ -698,34 +698,34 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): ...@@ -698,34 +698,34 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
training=False, training=False,
): ):
r""" r"""
mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input) mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input)
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> # Add a [CLS] to the vocabulary (we should train it also!) >>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'}) >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices] >>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
>>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids) >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
...@@ -55,9 +55,9 @@ TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -55,9 +55,9 @@ TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_sep_token=True): def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_sep_token=True):
""" """
Computes global attention mask by putting attention on all tokens Computes global attention mask by putting attention on all tokens
before `sep_token_id` if `before_sep_token is True` else after before `sep_token_id` if `before_sep_token is True` else after
`sep_token_id`. `sep_token_id`.
""" """
assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions" assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions"
...@@ -72,11 +72,14 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se ...@@ -72,11 +72,14 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
) )
else: else:
# the last token is a separation token and should not be counted, and in the middle are two separation tokens # the last token is a separation token and should not be counted, and in the middle are two separation tokens
attention_mask = tf.cast( attention_mask = (
tf.broadcast_to(attention_mask, input_ids_shape) tf.cast(
> tf.broadcast_to(question_end_index + 1, input_ids_shape), tf.broadcast_to(attention_mask, input_ids_shape)
tf.dtypes.int32, > tf.broadcast_to(question_end_index + 1, input_ids_shape),
) * tf.cast(tf.broadcast_to(attention_mask, input_ids_shape) < input_ids_shape[-1], tf.dtypes.int32) tf.dtypes.int32,
)
* tf.cast(tf.broadcast_to(attention_mask, input_ids_shape) < input_ids_shape[-1], tf.dtypes.int32)
)
return attention_mask return attention_mask
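A minimal sketch of the `before_sep_token=True` case described in the docstring above: put (global) attention on every token up to the first separator and nothing after it. This is a simplified stand-in that works from `input_ids` directly rather than from `sep_token_indices`, and it assumes a separator is present in every row:

import tensorflow as tf

def global_mask_before_first_sep(input_ids, sep_token_id):
    # 1 for every position up to and including the first separator token, 0 afterwards
    is_sep = tf.cast(tf.equal(input_ids, sep_token_id), tf.int32)
    first_sep = tf.argmax(is_sep, axis=-1, output_type=tf.int32)   # position of the first separator per row
    positions = tf.range(tf.shape(input_ids)[-1])[None, :]
    return tf.cast(positions <= first_sep[:, None], tf.int32)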
...@@ -130,7 +133,9 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -130,7 +133,9 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
self.one_sided_attn_window_size = attention_window // 2 self.one_sided_attn_window_size = attention_window // 2
def call( def call(
self, inputs, training=False, self,
inputs,
training=False,
): ):
""" """
LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`. LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`.
...@@ -433,7 +438,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -433,7 +438,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap): def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap):
"""Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. """Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
Returned tensor will be of the same shape as `attn_probs`""" Returned tensor will be of the same shape as `attn_probs`"""
batch_size, seq_len, num_heads, head_dim = shape_list(value) batch_size, seq_len, num_heads, head_dim = shape_list(value)
...@@ -508,17 +513,17 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -508,17 +513,17 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
@staticmethod @staticmethod
def _pad_and_diagonalize(chunked_hidden_states): def _pad_and_diagonalize(chunked_hidden_states):
"""shift every row 1 step right, converting columns into diagonals. """shift every row 1 step right, converting columns into diagonals.
Example: Example:
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492, chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
-1.8348, 0.7672, 0.2986, 0.0285, -1.8348, 0.7672, 0.2986, 0.0285,
-0.7584, 0.4206, -0.0405, 0.1599, -0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ] 2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4 window_overlap = num_rows = 4
(pad & diagonilize) => (pad & diagonilize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000 [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
""" """
total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states) total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
...@@ -779,7 +784,8 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -779,7 +784,8 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
tf.transpose(global_attn_output, (0, 2, 1, 3)), is_local_index_global_attn_nonzero tf.transpose(global_attn_output, (0, 2, 1, 3)), is_local_index_global_attn_nonzero
) )
nonzero_global_attn_output = tf.reshape( nonzero_global_attn_output = tf.reshape(
nonzero_global_attn_output, (shape_list(is_local_index_global_attn_nonzero)[0], -1), nonzero_global_attn_output,
(shape_list(is_local_index_global_attn_nonzero)[0], -1),
) )
# overwrite values with global attention # overwrite values with global attention
...@@ -910,9 +916,9 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -910,9 +916,9 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
self.embeddings.vocab_size = value.shape[0] self.embeddings.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
raise NotImplementedError raise NotImplementedError
...@@ -1021,7 +1027,10 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1021,7 +1027,10 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
sequence_output = sequence_output[:, :-padding_len] sequence_output = sequence_output[:, :-padding_len]
if not return_dict: if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:] return (
sequence_output,
pooled_output,
) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling( return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
...@@ -1031,7 +1040,13 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1031,7 +1040,13 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
) )
def _pad_to_window_size( def _pad_to_window_size(
self, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, pad_token_id, self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
inputs_embeds,
pad_token_id,
): ):
"""A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" """A helper function to pad tokens and mask to work with implementation of Longformer selfattention."""
# padding # padding
...@@ -1083,8 +1098,8 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1083,8 +1098,8 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
class TFLongformerPreTrainedModel(TFPreTrainedModel): class TFLongformerPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = LongformerConfig config_class = LongformerConfig
...@@ -1286,7 +1301,10 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel ...@@ -1286,7 +1301,10 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
......
...@@ -99,8 +99,7 @@ NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm} ...@@ -99,8 +99,7 @@ NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm}
class TFMobileBertEmbeddings(tf.keras.layers.Layer): class TFMobileBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings. """Construct the embeddings from word, position and token_type embeddings."""
"""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -223,10 +222,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -223,10 +222,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
def _linear(self, inputs): def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer. """Computes logits by running inputs through a linear layer.
Args: Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = shape_list(inputs)[0] batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1] length = shape_list(inputs)[1]
...@@ -696,9 +695,9 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -696,9 +695,9 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
raise NotImplementedError raise NotImplementedError
...@@ -799,7 +798,10 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -799,7 +798,10 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
if not return_dict: if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:] return (
sequence_output,
pooled_output,
) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling( return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
...@@ -810,8 +812,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -810,8 +812,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
class TFMobileBertPreTrainedModel(TFPreTrainedModel): class TFMobileBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = MobileBertConfig config_class = MobileBertConfig
...@@ -967,18 +969,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): ...@@ -967,18 +969,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
@replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased') >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased') >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
>>> outputs = model(input_ids) >>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2] >>> prediction_scores, seq_relationship_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
...@@ -1069,7 +1071,10 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel ...@@ -1069,7 +1071,10 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1098,21 +1103,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1098,21 +1103,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased') >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased') >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf') >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
...@@ -1125,7 +1130,9 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1125,7 +1130,9 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
return (seq_relationship_score,) + outputs[2:] return (seq_relationship_score,) + outputs[2:]
return TFNextSentencePredictorOutput( return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1206,7 +1213,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque ...@@ -1206,7 +1213,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1323,7 +1333,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic ...@@ -1323,7 +1333,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -1425,7 +1435,10 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic ...@@ -1425,7 +1435,10 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1504,5 +1517,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla ...@@ -1504,5 +1517,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -243,8 +243,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -243,8 +243,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.tokens_embed.vocab_size = value.shape[0] self.tokens_embed.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
raise NotImplementedError raise NotImplementedError
...@@ -373,13 +373,15 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -373,13 +373,15 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput( return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions, last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
) )
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = OpenAIGPTConfig config_class = OpenAIGPTConfig
...@@ -630,31 +632,31 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): ...@@ -630,31 +632,31 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
training=False, training=False,
): ):
r""" r"""
mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input) mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input)
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``. Selected in the range ``[0, input_ids.size(-1) - 1]``.
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
>>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
>>> # Add a [CLS] to the vocabulary (we should train it also!) >>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> tokenizer.add_special_tokens({'cls_token': '[CLS]'}) >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size >>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoding = tokenizer(choices, return_tensors="tf") >>> encoding = tokenizer(choices, return_tensors="tf")
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()} >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
>>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1 >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1
>>> outputs = model(inputs) >>> outputs = model(inputs)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
...@@ -28,15 +28,15 @@ logger = logging.get_logger(__name__) ...@@ -28,15 +28,15 @@ logger = logging.get_logger(__name__)
def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""): def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""):
""" Convert a TF 2.0 model variable name in a pytorch model weight name. """Convert a TF 2.0 model variable name in a pytorch model weight name.
Conventions for TF2.0 scopes -> PyTorch attribute names conversions: Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
- '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
- '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
return tuple with: return tuple with:
- pytorch model weight name - pytorch model weight name
- transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regard to each other - transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regard to each other
""" """
tf_name = tf_name.replace(":0", "") # device ids tf_name = tf_name.replace(":0", "") # device ids
tf_name = re.sub( tf_name = re.sub(
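A hypothetical mini-version covering only the two scope conventions listed in the docstring above; the regex details are assumptions, and the real function additionally maps kernel/gamma/beta variables to their PyTorch names and decides whether a transpose is needed:

import re

def tf_scope_to_pt_name(tf_name):
    tf_name = tf_name.replace(":0", "")                        # drop the device id suffix
    tf_name = re.sub(r"/[^/]*___([^/]*)/", r"/\1/", tf_name)   # '$1___$2' is replaced by $2
    tf_name = tf_name.replace("_._", "/")                      # '_._' becomes a new level separation
    return ".".join(tf_name.split("/"))                        # scopes turn into attribute access

print(tf_scope_to_pt_name("bert/encoder/layer_._0/attention/self/query/kernel:0"))
# bert.encoder.layer.0.attention.self.query.kernel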
...@@ -72,8 +72,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="") ...@@ -72,8 +72,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch checkpoints in a TF 2.0 model """Load pytorch checkpoints in a TF 2.0 model"""
"""
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
import torch # noqa: F401 import torch # noqa: F401
...@@ -96,8 +95,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i ...@@ -96,8 +95,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch checkpoints in a TF 2.0 model """Load pytorch checkpoints in a TF 2.0 model"""
"""
pt_state_dict = pt_model.state_dict() pt_state_dict = pt_model.state_dict()
return load_pytorch_weights_in_tf2_model( return load_pytorch_weights_in_tf2_model(
...@@ -106,8 +104,7 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi ...@@ -106,8 +104,7 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi
def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch state_dict in a TF 2.0 model. """Load pytorch state_dict in a TF 2.0 model."""
"""
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
import torch # noqa: F401 import torch # noqa: F401
...@@ -230,9 +227,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ...@@ -230,9 +227,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
""" Load TF 2.0 HDF5 checkpoint in a PyTorch model """Load TF 2.0 HDF5 checkpoint in a PyTorch model
We use HDF5 to easily do transfer learning We use HDF5 to easily do transfer learning
(see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
""" """
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
...@@ -265,16 +262,14 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs ...@@ -265,16 +262,14 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
""" Load TF 2.0 model in a pytorch model """Load TF 2.0 model in a pytorch model"""
"""
weights = tf_model.weights weights = tf_model.weights
return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys) return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys)
def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False): def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
""" Load TF2.0 symbolic weights in a PyTorch model """Load TF2.0 symbolic weights in a PyTorch model"""
"""
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
import torch # noqa: F401 import torch # noqa: F401
......
...@@ -73,7 +73,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): ...@@ -73,7 +73,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
self.padding_idx = 1 self.padding_idx = 1
def create_position_ids_from_input_ids(self, x): def create_position_ids_from_input_ids(self, x):
""" Replace non-padding symbols with their position numbers. Position numbers begin at """Replace non-padding symbols with their position numbers. Position numbers begin at
padding_idx+1. Padding symbols are ignored. This is modified from fairseq's padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
`utils.make_positions`. `utils.make_positions`.
:param tf.Tensor x: :param tf.Tensor x:
...@@ -84,7 +84,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): ...@@ -84,7 +84,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
return incremental_indicies + self.padding_idx return incremental_indicies + self.padding_idx
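A worked sketch of the rule above, assuming `padding_idx = 1`: padding positions keep `padding_idx`, and real tokens are numbered from `padding_idx + 1` onwards.

import tensorflow as tf

def position_ids_from_input_ids(x, padding_idx=1):
    # mask out padding, count real tokens cumulatively, then shift by padding_idx
    mask = tf.cast(tf.math.not_equal(x, padding_idx), dtype=tf.int32)
    incremental_indices = tf.math.cumsum(mask, axis=1) * mask
    return incremental_indices + padding_idx

# e.g. input_ids [[5, 6, 1, 1]] (with 1 as padding) -> position ids [[2, 3, 1, 1]]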
def create_position_ids_from_inputs_embeds(self, inputs_embeds): def create_position_ids_from_inputs_embeds(self, inputs_embeds):
""" We are provided embeddings directly. We cannot infer which are padded so just generate """We are provided embeddings directly. We cannot infer which are padded so just generate
sequential position ids. sequential position ids.
:param tf.Tensor inputs_embeds: :param tf.Tensor inputs_embeds:
:return tf.Tensor: :return tf.Tensor:
...@@ -120,8 +120,8 @@ class TFRobertaMainLayer(TFBertMainLayer): ...@@ -120,8 +120,8 @@ class TFRobertaMainLayer(TFBertMainLayer):
class TFRobertaPreTrainedModel(TFPreTrainedModel): class TFRobertaPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = RobertaConfig config_class = RobertaConfig
...@@ -330,7 +330,10 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -330,7 +330,10 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -431,7 +434,10 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -431,7 +434,10 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -452,7 +458,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -452,7 +458,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -549,7 +555,10 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -549,7 +555,10 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -628,7 +637,10 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -628,7 +637,10 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
......
...@@ -67,8 +67,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -67,8 +67,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFT5LayerNorm(tf.keras.layers.Layer): class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs): def __init__(self, epsilon=1e-6, **kwargs):
""" Construct a layernorm module in the T5 style """Construct a layernorm module in the T5 style
No bias and no subtraction of mean. No bias and no subtraction of mean.
""" """
super().__init__(**kwargs) super().__init__(**kwargs)
self.variance_epsilon = epsilon self.variance_epsilon = epsilon
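A minimal sketch of the T5-style normalization the docstring describes (scale by the root mean square only, with no mean subtraction and no bias); the variable names here are assumptions, not the layer's internals:

import tensorflow as tf

x = tf.constant([[1.0, 2.0, 3.0]])
weight = tf.ones_like(x)          # stands in for the learned scale parameter
epsilon = 1e-6

# Variance computed from raw squares (no mean subtraction), then scale only.
variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
normed = weight * x * tf.math.rsqrt(variance + epsilon)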
...@@ -140,7 +140,9 @@ class TFT5Attention(tf.keras.layers.Layer): ...@@ -140,7 +140,9 @@ class TFT5Attention(tf.keras.layers.Layer):
if self.has_relative_attention_bias: if self.has_relative_attention_bias:
self.relative_attention_bias = tf.keras.layers.Embedding( self.relative_attention_bias = tf.keras.layers.Embedding(
self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias", self.relative_attention_num_buckets,
self.n_heads,
name="relative_attention_bias",
) )
self.pruned_heads = set() self.pruned_heads = set()
...@@ -199,7 +201,9 @@ class TFT5Attention(tf.keras.layers.Layer): ...@@ -199,7 +201,9 @@ class TFT5Attention(tf.keras.layers.Layer):
memory_position = tf.range(klen)[None, :] memory_position = tf.range(klen)[None, :]
relative_position = memory_position - context_position # shape (qlen, klen) relative_position = memory_position - context_position # shape (qlen, klen)
rp_bucket = self._relative_position_bucket( rp_bucket = self._relative_position_bucket(
relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, relative_position,
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets,
) )
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
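A tiny illustration (not taken from the diff) of the relative-position matrix that gets bucketed above, for qlen = klen = 3:

import tensorflow as tf

qlen = klen = 3
context_position = tf.range(qlen)[:, None]
memory_position = tf.range(klen)[None, :]
relative_position = memory_position - context_position  # shape (qlen, klen)
print(relative_position.numpy())
# [[ 0  1  2]
#  [-1  0  1]
#  [-2 -1  0]]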
...@@ -316,7 +320,9 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): ...@@ -316,7 +320,9 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs): def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.SelfAttention = TFT5Attention( self.SelfAttention = TFT5Attention(
config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention", config,
has_relative_attention_bias=has_relative_attention_bias,
name="SelfAttention",
) )
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...@@ -353,7 +359,9 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): ...@@ -353,7 +359,9 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs): def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.EncDecAttention = TFT5Attention( self.EncDecAttention = TFT5Attention(
config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention", config,
has_relative_attention_bias=has_relative_attention_bias,
name="EncDecAttention",
) )
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...@@ -396,12 +404,18 @@ class TFT5Block(tf.keras.layers.Layer): ...@@ -396,12 +404,18 @@ class TFT5Block(tf.keras.layers.Layer):
self.is_decoder = config.is_decoder self.is_decoder = config.is_decoder
self.layer = [] self.layer = []
self.layer.append( self.layer.append(
TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0",) TFT5LayerSelfAttention(
config,
has_relative_attention_bias=has_relative_attention_bias,
name="layer_._0",
)
) )
if self.is_decoder: if self.is_decoder:
self.layer.append( self.layer.append(
TFT5LayerCrossAttention( TFT5LayerCrossAttention(
config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1", config,
has_relative_attention_bias=has_relative_attention_bias,
name="layer_._1",
) )
) )
...@@ -490,9 +504,9 @@ class TFT5Block(tf.keras.layers.Layer): ...@@ -490,9 +504,9 @@ class TFT5Block(tf.keras.layers.Layer):
class _NoLayerEmbedTokens: class _NoLayerEmbedTokens:
""" """
this class wraps the TFSharedEmbeddingTokens layer into a python 'no-keras-layer' this class wraps the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
class to avoid problems with weight restoring. It also makes sure that the layer is class to avoid problems with weight restoring. It also makes sure that the layer is
called from the correct scope to avoid problems with saving/storing the correct weights called from the correct scope to avoid problems with saving/storing the correct weights
""" """
def __init__(self, layer, abs_scope_name=None): def __init__(self, layer, abs_scope_name=None):
...@@ -539,7 +553,11 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -539,7 +553,11 @@ class TFT5MainLayer(tf.keras.layers.Layer):
self.num_hidden_layers = config.num_layers self.num_hidden_layers = config.num_layers
self.block = [ self.block = [
TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i),) TFT5Block(
config,
has_relative_attention_bias=bool(i == 0),
name="block_._{}".format(i),
)
for i in range(config.num_layers) for i in range(config.num_layers)
] ]
self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
...@@ -654,7 +672,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -654,7 +672,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if self.is_decoder: if self.is_decoder:
seq_ids = tf.range(mask_seq_length) seq_ids = tf.range(mask_seq_length)
causal_mask = tf.less_equal( causal_mask = tf.less_equal(
tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), seq_ids[None, :, None], tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
seq_ids[None, :, None],
) )
causal_mask = tf.cast(causal_mask, dtype=tf.float32) causal_mask = tf.cast(causal_mask, dtype=tf.float32)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
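What the causal mask construction above evaluates to for a length-3 sequence and batch size 1; a hedged sketch, not library code:

import tensorflow as tf

batch_size, mask_seq_length = 1, 3
seq_ids = tf.range(mask_seq_length)
causal_mask = tf.less_equal(
    tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
    seq_ids[None, :, None],
)
print(tf.cast(causal_mask, tf.float32).numpy()[0])
# [[1. 0. 0.]
#  [1. 1. 0.]
#  [1. 1. 1.]]  -- each query position attends only to itself and earlier positions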
...@@ -765,8 +784,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -765,8 +784,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
# pointers for your model. # pointers for your model.
#################################################### ####################################################
class TFT5PreTrainedModel(TFPreTrainedModel): class TFT5PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = T5Config config_class = T5Config
...@@ -961,17 +980,17 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -961,17 +980,17 @@ class TFT5Model(TFT5PreTrainedModel):
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
Examples:: Examples::
>>> from transformers import T5Tokenizer, TFT5Model >>> from transformers import T5Tokenizer, TFT5Model
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5Model.from_pretrained('t5-small') >>> model = TFT5Model.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs) >>> outputs = model(inputs, decoder_input_ids=inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -1157,26 +1176,26 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1157,26 +1176,26 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
training=False, training=False,
): ):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss. Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Returns: Returns:
Examples:: Examples::
>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs) >>> outputs = model(inputs, decoder_input_ids=inputs)
>>> prediction_scores = outputs[0] >>> prediction_scores = outputs[0]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> result = model.generate(inputs) >>> result = model.generate(inputs)
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
...@@ -628,7 +628,13 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -628,7 +628,13 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
hids.append(core_out) hids.append(core_out)
mems_i = None if mems is None else mems[i] mems_i = None if mems is None else mems[i]
layer_outputs = layer( layer_outputs = layer(
core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i], output_attentions, training=training, core_out,
pos_emb,
dec_attn_mask,
mems_i,
head_mask[i],
output_attentions,
training=training,
) )
core_out = layer_outputs[0] core_out = layer_outputs[0]
if output_attentions: if output_attentions:
...@@ -657,13 +663,16 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -657,13 +663,16 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TFTransfoXLModelOutput( return TFTransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions, last_hidden_state=core_out,
mems=new_mems,
hidden_states=hids,
attentions=attentions,
) )
class TFTransfoXLPreTrainedModel(TFPreTrainedModel): class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = TransfoXLConfig config_class = TransfoXLConfig
...@@ -852,8 +861,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -852,8 +861,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
) )
def get_output_embeddings(self): def get_output_embeddings(self):
""" Double-check if you are using adaptive softmax. """Double-check if you are using adaptive softmax."""
"""
if len(self.crit.out_layers) > 0: if len(self.crit.out_layers) > 0:
return self.crit.out_layers[-1] return self.crit.out_layers[-1]
return None return None
......
...@@ -64,7 +64,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): ...@@ -64,7 +64,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
else: else:
self.out_projs.append(None) self.out_projs.append(None)
weight = self.add_weight( weight = self.add_weight(
shape=(self.vocab_size, self.d_embed,), shape=(
self.vocab_size,
self.d_embed,
),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._weight".format(i), name="out_layers_._{}_._weight".format(i),
...@@ -86,7 +89,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): ...@@ -86,7 +89,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
) )
self.out_projs.append(weight) self.out_projs.append(weight)
weight = self.add_weight( weight = self.add_weight(
shape=(r_idx - l_idx, d_emb_i,), shape=(
r_idx - l_idx,
d_emb_i,
),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._weight".format(i), name="out_layers_._{}_._weight".format(i),
......
...@@ -207,13 +207,12 @@ class TFMultipleChoiceLoss(TFSequenceClassificationLoss): ...@@ -207,13 +207,12 @@ class TFMultipleChoiceLoss(TFSequenceClassificationLoss):
class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss): class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
""" """
Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens. Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
.. note:: .. note::
Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
""" Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
"""
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
......