Unverified commit 54abc67a authored by Thomas Wolf, committed by GitHub

Merge pull request #2255 from aaugustin/implement-best-practices

Implement some Python best practices
parents 645713e2 c11b3e29
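For context, the conventions applied throughout the diff below look like black-style formatting plus isort-style import sorting (my reading of the changes; the tool configuration itself is not shown in this excerpt): double-quoted strings, parenthesized multi-line wrapping instead of backslash continuations, and alphabetized, one-name-per-line imports. A minimal sketch of the before/after style:

# Old style, as seen on the removed lines: single quotes, compact literals.
options = {'cache_dir': None, 'proxies': None}

# New style, as seen on the added lines: double quotes, parenthesized wrapping.
options = {
    "cache_dir": None,
    "proxies": None,
}
print(options)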
@@ -14,8 +14,7 @@
# limitations under the License.
""" Configuration base class and utilities."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import json
@@ -24,9 +23,15 @@ import os
from io import open
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \
cached_path, is_remote_url, hf_bucket_url
from .file_utils import (
CONFIG_NAME,
MODEL_CARD_NAME,
TF2_WEIGHTS_NAME,
WEIGHTS_NAME,
cached_path,
hf_bucket_url,
is_remote_url,
)
logger = logging.getLogger(__name__)
@@ -48,17 +53,18 @@ class ModelCard(object):
Parameters:
"""
def __init__(self, **kwargs):
# Recommended attributes from https://arxiv.org/abs/1810.03993 (see paper)
self.model_details = kwargs.pop('model_details', {})
self.intended_use = kwargs.pop('intended_use', {})
self.factors = kwargs.pop('factors', {})
self.metrics = kwargs.pop('metrics', {})
self.evaluation_data = kwargs.pop('evaluation_data', {})
self.training_data = kwargs.pop('training_data', {})
self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
self.ethical_considerations = kwargs.pop('ethical_considerations', {})
self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
self.model_details = kwargs.pop("model_details", {})
self.intended_use = kwargs.pop("intended_use", {})
self.factors = kwargs.pop("factors", {})
self.metrics = kwargs.pop("metrics", {})
self.evaluation_data = kwargs.pop("evaluation_data", {})
self.training_data = kwargs.pop("training_data", {})
self.quantitative_analyses = kwargs.pop("quantitative_analyses", {})
self.ethical_considerations = kwargs.pop("ethical_considerations", {})
self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {})
# Open additional attributes
for key, value in kwargs.items():
@@ -122,10 +128,10 @@ class ModelCard(object):
modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
"""
cache_dir = kwargs.pop('cache_dir', None)
proxies = kwargs.pop('proxies', None)
find_from_standard_name = kwargs.pop('find_from_standard_name', True)
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
cache_dir = kwargs.pop("cache_dir", None)
proxies = kwargs.pop("proxies", None)
find_from_standard_name = kwargs.pop("find_from_standard_name", True)
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
# For simplicity we use the same pretrained URL as the configuration files
@@ -145,36 +151,43 @@ class ModelCard(object):
try:
# Load from URL or cache if already cached
resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True,
proxies=proxies, resume_download=False)
resolved_model_card_file = cached_path(
model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False
)
if resolved_model_card_file == model_card_file:
logger.info("loading model card file {}".format(model_card_file))
else:
logger.info("loading model card file {} from cache at {}".format(
model_card_file, resolved_model_card_file))
logger.info(
"loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file)
)
# Load model card
modelcard = cls.from_json_file(resolved_model_card_file)
except EnvironmentError:
if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
logger.warning("Couldn't reach server at '{}' to download model card file.".format(
model_card_file))
logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file))
else:
logger.warning("Model name '{}' was not found in model name list ({}). " \
"We assumed '{}' was a path or url to a model card file named {} or " \
logger.warning(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url to a model card file named {} or "
"a directory containing such a file but couldn't find any such file at this path or url.".format(
pretrained_model_name_or_path,
', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
model_card_file, MODEL_CARD_NAME))
", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
model_card_file,
MODEL_CARD_NAME,
)
)
logger.warning("Creating an empty model card.")
# We fall back on creating an empty model card
modelcard = cls()
except json.JSONDecodeError:
logger.warning("Couldn't reach server at '{}' to download model card file or "
logger.warning(
"Couldn't reach server at '{}' to download model card file or "
"model card file is not a valid JSON file. "
"Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
"Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)
)
logger.warning("Creating an empty model card.")
# We fall back on creating an empty model card
@@ -203,7 +216,7 @@ class ModelCard(object):
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `ModelCard` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
dict_obj = json.loads(text)
return cls(**dict_obj)
@@ -225,5 +238,5 @@ class ModelCard(object):
def to_json_file(self, json_file_path):
""" Save this instance to a json file."""
with open(json_file_path, "w", encoding='utf-8') as writer:
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string())
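A round-trip sketch using the two JSON helpers above (values and the temporary path are hypothetical, and the `from transformers import ModelCard` path is an assumption for this version of the library):

import os
import tempfile

from transformers import ModelCard  # import path assumed

card = ModelCard(metrics={"accuracy": 0.9})  # hypothetical content
path = os.path.join(tempfile.mkdtemp(), "modelcard.json")
card.to_json_file(path)

restored = ModelCard.from_json_file(path)
print(restored.metrics)  # {'accuracy': 0.9}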
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
@@ -15,29 +14,33 @@
# limitations under the License.
"""PyTorch ALBERT model. """
import os
import math
import logging
import math
import os
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_albert import AlbertConfig
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN
from transformers.modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
from transformers.modeling_utils import PreTrainedModel
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__)
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
"albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
"albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
"albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
"albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
"albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
"albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
"albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
"albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
}
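The map above is how from_pretrained resolves a shortcut name to a weights URL; identifiers not in the map are treated further down as local paths or remote URLs. A lookup sketch (no download performed):

url = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP["albert-base-v2"]
print(url)  # .../albert-base-v2-pytorch_model.bin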
@@ -48,8 +51,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -109,7 +114,7 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
if "seq_relationship" in name:
continue
name = name.split('/')
name = name.split("/")
# Ignore the gradients applied by the LAMB/ADAM optimizers.
if "adam_m" in name or "adam_v" in name or "global_step" in name:
@@ -118,32 +123,32 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
elif l[0] == 'squad':
pointer = getattr(pointer, 'classifier')
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
assert pointer.shape == array.shape
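The loop above walks each slash-separated TensorFlow variable name down the PyTorch module tree: numeric suffixes such as layer_0 become an attribute lookup plus a ModuleList index, and TF's kernel/gamma/beta names are mapped to PyTorch's weight/bias (kernels are additionally transposed before copying). A standalone sketch of the traversal on a toy module (the variable name is hypothetical):

import re

import torch.nn as nn

# Toy module tree mirroring the attribute/index traversal in the loader above.
model = nn.Module()
model.encoder = nn.Module()
model.encoder.layer = nn.ModuleList([nn.Linear(4, 4)])

pointer = model
for m_name in "encoder/layer_0/kernel".split("/"):  # hypothetical TF variable name
    if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
        scope_names = re.split(r"_(\d+)", m_name)  # "layer_0" -> ["layer", "0", ""]
    else:
        scope_names = [m_name]
    if scope_names[0] == "kernel":  # TF "kernel" becomes PyTorch "weight"
        pointer = getattr(pointer, "weight")
    else:
        pointer = getattr(pointer, scope_names[0])
    if len(scope_names) >= 2:
        pointer = pointer[int(scope_names[1])]  # numeric suffix indexes a ModuleList
print(pointer.shape)  # torch.Size([4, 4])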
@@ -160,6 +165,7 @@ class AlbertEmbeddings(BertEmbeddings):
"""
Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(AlbertEmbeddings, self).__init__(config)
@@ -238,9 +244,12 @@ class AlbertAttention(BertSelfAttention):
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
reshaped_context_layer = context_layer.view(*new_context_layer_shape)
# Should find a better way to do this
w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
w = (
self.dense.weight.t()
.view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
.to(context_layer.dtype)
)
b = self.dense.bias.to(context_layer.dtype)
projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
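The einsum above applies the output projection per head and sums over heads in one step; a shape-level sketch of its equivalence with an explicit reshape-and-matmul (all dimensions hypothetical):

import torch

b, f, n, d, h = 2, 5, 4, 8, 32  # batch, seq, heads, head_size, hidden; n * d == h
context_layer = torch.randn(b, f, n, d)
w = torch.randn(n, d, h)  # the dense weight, reshaped per head as in the code above
bias = torch.randn(h)

out_einsum = torch.einsum("bfnd,ndh->bfh", context_layer, w) + bias
out_matmul = context_layer.reshape(b, f, n * d) @ w.reshape(n * d, h) + bias
print(torch.allclose(out_einsum, out_matmul, atol=1e-5))  # True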
@@ -328,7 +337,11 @@ class AlbertTransformer(nn.Module):
# Index of the layer inside the group
layer_idx = int(i - group_idx * layers_per_group)
layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group])
layer_group_output = self.albert_layer_groups[group_idx](
hidden_states,
attention_mask,
head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
)
hidden_states = layer_group_output[0]
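A worked sketch of the group arithmetic above (config values hypothetical; group_idx = i // layers_per_group is my assumption, consistent with the head_mask slice bounds shown):

num_hidden_layers, num_hidden_groups = 12, 4  # hypothetical config values
layers_per_group = num_hidden_layers // num_hidden_groups  # 3
for i in range(num_hidden_layers):
    group_idx = i // layers_per_group  # assumed, per the slice bounds above
    layer_idx = int(i - group_idx * layers_per_group)
    mask_slice = (group_idx * layers_per_group, (group_idx + 1) * layers_per_group)
    print(i, group_idx, layer_idx, mask_slice)
# Layers 0-2 reuse group 0 with head_mask[0:3], layers 3-5 group 1 with head_mask[3:6], etc.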
if self.output_attentions:
@@ -337,7 +350,6 @@ class AlbertTransformer(nn.Module):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
@@ -346,11 +358,11 @@ class AlbertTransformer(nn.Module):
return outputs # last-layer hidden state, (all hidden states), (all attentions)
class AlbertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "albert"
@@ -431,8 +443,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class AlbertModel(AlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -500,8 +516,15 @@ class AlbertModel(AlbertPreTrainedModel):
inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
inputs_embeds=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -527,24 +550,30 @@ class AlbertModel(AlbertPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
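A shape-only sketch of the head-mask broadcasting above (dimensions hypothetical): a 1D mask is shared across layers, a 2D mask is per layer, and both end up as [num_layers, 1, num_heads, 1, 1]:

import torch

num_hidden_layers, num_heads = 12, 4  # hypothetical
mask_1d = torch.ones(num_heads)  # one mask shared by every layer
m = mask_1d.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
m = m.expand(num_hidden_layers, -1, -1, -1, -1)
print(m.shape)  # torch.Size([12, 1, 4, 1, 1])

mask_2d = torch.ones(num_hidden_layers, num_heads)  # a mask per layer
m2 = mask_2d.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
print(m2.shape)  # torch.Size([12, 1, 4, 1, 1])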
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds)
encoder_outputs = self.encoder(embedding_output,
extended_attention_mask,
head_mask=head_mask)
embedding_output = self.embeddings(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
outputs = (sequence_output, pooled_output) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs
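A usage sketch of the outputs returned above (downloads pretrained weights, so it needs network access; import paths assumed for this version of the library):

import torch

from transformers import AlbertModel, AlbertTokenizer  # import paths assumed

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertModel.from_pretrained("albert-base-v2")
input_ids = torch.tensor([tokenizer.encode("Hello, world")])
sequence_output, pooled_output = model(input_ids)[:2]
print(sequence_output.shape, pooled_output.shape)  # (1, seq_len, 768) and (1, 768)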
class AlbertMLMHead(nn.Module):
def __init__(self, config):
super(AlbertMLMHead, self).__init__()
@@ -566,7 +595,9 @@ class AlbertMLMHead(nn.Module):
return prediction_scores
@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING
)
class AlbertForMaskedLM(AlbertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -602,21 +633,28 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
self._tie_or_clone_weights(self.predictions.decoder,
self.albert.embeddings.word_embeddings)
self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings)
def get_output_embeddings(self):
return self.predictions.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
):
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds
inputs_embeds=inputs_embeds,
)
sequence_outputs = outputs[0]
@@ -631,9 +669,12 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return outputs
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -665,6 +706,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(AlbertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
@@ -675,8 +717,16 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.albert(
input_ids=input_ids,
@@ -684,7 +734,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
@@ -707,10 +757,12 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -752,6 +804,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
"""
def __init__(self, config):
super(AlbertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
@@ -761,8 +814,17 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
inputs_embeds=None, start_positions=None, end_positions=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
):
outputs = self.albert(
input_ids=input_ids,
@@ -770,7 +832,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
@@ -18,40 +18,91 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
XLM_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_utils import PreTrainedModel, SequenceSummary
from .file_utils import add_start_docstrings
from .configuration_auto import (
AlbertConfig,
BertConfig,
CamembertConfig,
CTRLConfig,
DistilBertConfig,
GPT2Config,
OpenAIGPTConfig,
RobertaConfig,
TransfoXLConfig,
XLMConfig,
XLMRobertaConfig,
XLNetConfig,
)
from .modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForMaskedLM,
AlbertForQuestionAnswering,
AlbertForSequenceClassification,
AlbertModel,
)
from .modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForMaskedLM,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertModel,
)
from .modeling_camembert import (
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
CamembertForMaskedLM,
CamembertForSequenceClassification,
CamembertForTokenClassification,
CamembertModel,
)
from .modeling_ctrl import CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRLModel
from .modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForMaskedLM,
DistilBertForQuestionAnswering,
DistilBertForSequenceClassification,
DistilBertForTokenClassification,
DistilBertModel,
)
from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model
from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel
from .modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
RobertaForMaskedLM,
RobertaForSequenceClassification,
RobertaForTokenClassification,
RobertaModel,
)
from .modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5Model, T5WithLMHeadModel
from .modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TransfoXLLMHeadModel, TransfoXLModel
from .modeling_xlm import (
XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
XLMForQuestionAnswering,
XLMForSequenceClassification,
XLMModel,
XLMWithLMHeadModel,
)
from .modeling_xlm_roberta import (
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
XLMRobertaForMaskedLM,
XLMRobertaForSequenceClassification,
XLMRobertaForTokenClassification,
XLMRobertaModel,
)
from .modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
XLNetForSequenceClassification,
XLNetForTokenClassification,
XLNetLMHeadModel,
XLNetModel,
)
logger = logging.getLogger(__name__)
ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
(key, value)
for pretrained_map in [
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -67,7 +118,8 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
T5_PRETRAINED_MODEL_ARCHIVE_MAP,
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
]
for key, value in pretrained_map.items())
for key, value in pretrained_map.items()
)
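The merged map above uses dict() over a generator; a dict comprehension is the equivalent and arguably more idiomatic modern form (toy maps for illustration):

maps = [{"bert-base-uncased": "url-1"}, {"gpt2": "url-2"}]  # toy stand-ins
merged = {key: value for pretrained_map in maps for key, value in pretrained_map.items()}
print(merged)  # {'bert-base-uncased': 'url-1', 'gpt2': 'url-2'}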
class AutoModel(object):
@@ -98,10 +150,13 @@ class AutoModel(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModel is designed to be instantiated "
raise EnvironmentError(
"AutoModel is designed to be instantiated "
"using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModel.from_config(config)` methods.")
"`AutoModel.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -232,35 +287,39 @@ class AutoModel(object):
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
elif "camembert" in pretrained_model_name_or_path:
return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
"'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(
pretrained_model_name_or_path
)
)
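Note that the dispatch above is first-match-wins on substrings, so order matters: 'xlm-roberta' must be tested before 'roberta' and 'xlm', and 'distilbert'/'albert'/'camembert'/'roberta' before 'bert', since all of those names contain 'bert'. A reduced sketch of the same pattern:

def resolve(name):
    # First-match-wins substring dispatch, reduced to a toy table. Order matters
    # because names nest: "xlm-roberta" contains "roberta" and "xlm", and
    # "roberta" itself contains "bert".
    table = [
        ("xlm-roberta", "XLMRobertaModel"),
        ("roberta", "RobertaModel"),
        ("bert", "BertModel"),
        ("xlm", "XLMModel"),
    ]
    for key, cls in table:
        if key in name:
            return cls
    raise ValueError("Unrecognized model identifier in {}".format(name))

print(resolve("xlm-roberta-base"))  # XLMRobertaModel, not RobertaModel or XLMModel
print(resolve("roberta-large"))     # RobertaModel, not BertModel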
class AutoModelWithLMHead(object):
@@ -291,10 +350,13 @@ class AutoModelWithLMHead(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
raise EnvironmentError(
"AutoModelWithLMHead is designed to be instantiated "
"using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelWithLMHead.from_config(config)` methods.")
"`AutoModelWithLMHead.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -423,35 +485,39 @@ class AutoModelWithLMHead(object):
model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
elif "camembert" in pretrained_model_name_or_path:
return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
"'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(
pretrained_model_name_or_path
)
)
class AutoModelForSequenceClassification(object):
@@ -477,10 +543,13 @@ class AutoModelForSequenceClassification(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
raise EnvironmentError(
"AutoModelForSequenceClassification is designed to be instantiated "
"using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForSequenceClassification.from_config(config)` methods.")
"`AutoModelForSequenceClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -597,25 +666,39 @@ class AutoModelForSequenceClassification(object):
model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return DistilBertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "albert" in pretrained_model_name_or_path:
return AlbertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "camembert" in pretrained_model_name_or_path:
return CamembertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return RobertaForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "bert" in pretrained_model_name_or_path:
return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(
pretrained_model_name_or_path
)
)
class AutoModelForQuestionAnswering(object):
@@ -638,10 +721,13 @@ class AutoModelForQuestionAnswering(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
raise EnvironmentError(
"AutoModelForQuestionAnswering is designed to be instantiated "
"using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForQuestionAnswering.from_config(config)` methods.")
"`AutoModelForQuestionAnswering.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -745,26 +831,30 @@ class AutoModelForQuestionAnswering(object):
model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path)
)
class AutoModelForTokenClassification:
def __init__(self):
raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated "
raise EnvironmentError(
"AutoModelForTokenClassification is designed to be instantiated "
"using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForTokenClassification.from_config(config)` methods.")
"`AutoModelForTokenClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -870,18 +960,28 @@ class AutoModelForTokenClassification:
model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'camembert' in pretrained_model_name_or_path:
return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
if "camembert" in pretrained_model_name_or_path:
return CamembertForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(
pretrained_model_name_or_path
)
)
@@ -26,34 +26,35 @@ import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_bert import BertConfig
from .file_utils import add_start_docstrings
from .modeling_utils import PreTrainedModel, prune_linear_layer
logger = logging.getLogger(__name__)
BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
}
@@ -65,8 +66,10 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -81,7 +84,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split('/')
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
@@ -89,30 +92,30 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
continue
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
elif l[0] == 'squad':
pointer = getattr(pointer, 'classifier')
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
assert pointer.shape == array.shape
@@ -157,6 +160,7 @@ BertLayerNorm = torch.nn.LayerNorm
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
@@ -199,7 +203,8 @@ class BertSelfAttention(nn.Module):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
@@ -217,7 +222,14 @@ class BertSelfAttention(nn.Module):
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
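transpose_for_scores above splits the hidden dimension into heads and moves the head axis in front of the sequence axis; a shape sketch (dimensions hypothetical):

import torch

batch, seq, num_heads, head_size = 2, 7, 12, 64
x = torch.randn(batch, seq, num_heads * head_size)  # [B, T, hidden]
x = x.view(batch, seq, num_heads, head_size)        # [B, T, heads, head_size]
x = x.permute(0, 2, 1, 3)                           # [B, heads, T, head_size]
print(x.shape)  # torch.Size([2, 12, 7, 64])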
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
@@ -307,8 +319,17 @@ class BertAttention(nn.Module):
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
self_outputs = self.self(
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
@@ -318,7 +339,9 @@ class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
@@ -353,13 +376,22 @@ class BertLayer(nn.Module):
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
if self.is_decoder and encoder_hidden_states is not None:
cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
cross_attention_outputs = self.crossattention(
attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
@@ -376,14 +408,23 @@ class BertEncoder(nn.Module):
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask)
layer_outputs = layer_module(
hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
)
hidden_states = layer_outputs[0]
if self.output_attentions:
@@ -420,7 +461,9 @@ class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
@@ -440,9 +483,7 @@ class BertLMPredictionHead(nn.Module):
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size,
config.vocab_size,
bias=False)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
@@ -488,6 +529,7 @@ class BertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = BertConfig
pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_bert
@@ -581,8 +623,12 @@ BERT_INPUTS_DOCSTRING = r"""
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
"""
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class BertModel(BertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -612,6 +658,7 @@ class BertModel(BertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(BertModel, self).__init__(config)
self.config = config
......@@ -636,8 +683,17 @@ class BertModel(BertPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
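For illustration, a toy sketch of what a prune_heads call boils down to (the sizes and the heads_to_prune mapping are invented): rows of a projection belonging to the pruned heads are dropped with an index select.
import torch
import torch.nn as nn
n_heads, head_dim = 4, 8  # hypothetical head layout
lin = nn.Linear(n_heads * head_dim, n_heads * head_dim)
heads_to_prune = {0: [1, 3]}  # layer 0: drop heads 1 and 3
mask = torch.ones(n_heads, head_dim)
for head in heads_to_prune[0]:
    mask[head] = 0
index = torch.arange(n_heads * head_dim)[mask.view(-1).bool()]
pruned_weight = lin.weight.index_select(0, index)  # keep rows of surviving heads
print(pruned_weight.shape)  # torch.Size([16, 32])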
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
""" Forward pass on the Model.
The model can behave as an encoder (with only self-attention) as well
......@@ -681,12 +737,18 @@ class BertModel(BertPreTrainedModel):
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3
causal_mask = causal_mask.to(
torch.long
) # not converting to long will cause errors with pytorch version < 1.3
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
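A standalone sketch of the broadcast trick used above (batch size and sequence length are made up): query position q may attend to key position k only when k <= q.
import torch
batch_size, seq_length = 2, 4  # toy sizes
seq_ids = torch.arange(seq_length)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
print(causal_mask[0].long())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])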
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
......@@ -709,10 +771,15 @@ class BertModel(BertPreTrainedModel):
elif encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
else:
raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
encoder_attention_mask.shape))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
raise ValueError(
"Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(
encoder_hidden_shape, encoder_attention_mask.shape
)
)
encoder_extended_attention_mask = encoder_extended_attention_mask.to(
dtype=next(self.parameters()).dtype
) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
else:
encoder_extended_attention_mask = None
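A quick sketch of the additive conversion applied above (the mask values are made up): attended positions map to 0.0 and masked ones to -10000.0, which is effectively minus infinity once added to the pre-softmax scores.
import torch
attention_mask = torch.tensor([[1.0, 1.0, 0.0]])  # last token is padding
extended = attention_mask[:, None, None, :]  # (batch, 1, 1, seq)
extended = (1.0 - extended) * -10000.0
print(extended)  # tensor([[[[-0., -0., -10000.]]]])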
......@@ -727,28 +794,40 @@ class BertModel(BertPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
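A toy sketch of the two accepted head_mask shapes handled above (layer and head counts are invented): a 1D mask is shared by every layer, a 2D mask supplies one row per layer, and both broadcast to five dimensions.
import torch
n_layer, n_heads = 3, 4  # hypothetical sizes
head_mask = torch.ones(n_heads)  # 1D: one mask for all layers
expanded = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
expanded = expanded.expand(n_layer, -1, -1, -1, -1)
print(expanded.shape)  # torch.Size([3, 1, 4, 1, 1])
per_layer = torch.ones(n_layer, n_heads)  # 2D: one row per layer
print(per_layer.unsqueeze(1).unsqueeze(-1).unsqueeze(-1).shape)  # torch.Size([3, 1, 4, 1, 1])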
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
encoder_outputs = self.encoder(embedding_output,
embedding_output = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask)
encoder_attention_mask=encoder_extended_attention_mask,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
@add_start_docstrings(
"""Bert Model with two heads on top as done during the pre-training:
a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -786,6 +865,7 @@ class BertForPreTraining(BertPreTrainedModel):
prediction_scores, seq_relationship_scores = outputs[:2]
"""
def __init__(self, config):
super(BertForPreTraining, self).__init__(config)
......@@ -797,20 +877,33 @@ class BertForPreTraining(BertPreTrainedModel):
def get_output_embeddings(self):
return self.cls.predictions.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None, next_sentence_label=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
next_sentence_label=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss()
......@@ -822,9 +915,9 @@ class BertForPreTraining(BertPreTrainedModel):
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING
)
class BertForMaskedLM(BertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -862,6 +955,7 @@ class BertForMaskedLM(BertPreTrainedModel):
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(BertForMaskedLM, self).__init__(config)
......@@ -873,17 +967,30 @@ class BertForMaskedLM(BertPreTrainedModel):
def get_output_embeddings(self):
return self.cls.predictions.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
lm_labels=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask)
encoder_attention_mask=encoder_attention_mask,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
......@@ -912,9 +1019,11 @@ class BertForMaskedLM(BertPreTrainedModel):
return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForNextSentencePrediction(BertPreTrainedModel):
r"""
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -945,6 +1054,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
seq_relationship_scores = outputs[0]
"""
def __init__(self, config):
super(BertForNextSentencePrediction, self).__init__(config)
......@@ -953,15 +1063,25 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
next_sentence_label=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
next_sentence_label=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......@@ -976,10 +1096,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -1011,6 +1133,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -1021,15 +1144,25 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......@@ -1051,10 +1184,12 @@ class BertForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForMultipleChoice(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -1087,6 +1222,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
loss, classification_scores = outputs[:2]
"""
def __init__(self, config):
super(BertForMultipleChoice, self).__init__(config)
......@@ -1096,8 +1232,16 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
num_choices = input_ids.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1))
......@@ -1105,12 +1249,14 @@ class BertForMultipleChoice(BertPreTrainedModel):
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
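A toy sketch of the flatten-then-unflatten pattern used here (all sizes invented): choices fold into the batch dimension for the encoder, and the per-choice scores are reshaped back before the softmax over choices.
import torch
batch_size, num_choices, seq_len = 2, 4, 7  # hypothetical sizes
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 7)
logits = torch.randn(flat_input_ids.size(0), 1)  # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)  # (2, 4)
print(flat_input_ids.shape, reshaped_logits.shape)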
outputs = self.bert(input_ids,
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......@@ -1128,10 +1274,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -1161,6 +1309,7 @@ class BertForTokenClassification(BertPreTrainedModel):
loss, scores = outputs[:2]
"""
def __init__(self, config):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -1171,15 +1320,25 @@ class BertForTokenClassification(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
......@@ -1202,10 +1361,12 @@ class BertForTokenClassification(BertPreTrainedModel):
return outputs # (loss), scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -1247,6 +1408,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
"""
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
......@@ -1256,15 +1418,26 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
......
......@@ -15,19 +15,25 @@
# limitations under the License.
"""PyTorch CamemBERT model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
from .configuration_camembert import CamembertConfig
from .file_utils import add_start_docstrings
from .modeling_roberta import (
RobertaForMaskedLM,
RobertaForMultipleChoice,
RobertaForSequenceClassification,
RobertaForTokenClassification,
RobertaModel,
)
logger = logging.getLogger(__name__)
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
"camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
}
......@@ -100,8 +106,12 @@ CAMEMBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertModel(RobertaModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -149,8 +159,11 @@ class CamembertModel(RobertaModel):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForMaskedLM(RobertaForMaskedLM):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -185,9 +198,12 @@ class CamembertForMaskedLM(RobertaForMaskedLM):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
@add_start_docstrings(
"""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
on top of the pooled output) e.g. for GLUE tasks. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForSequenceClassification(RobertaForSequenceClassification):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -223,9 +239,12 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForMultipleChoice(RobertaForMultipleChoice):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -257,9 +276,12 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""CamemBERT Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForTokenClassification(RobertaForTokenClassification):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......
......@@ -17,22 +17,17 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings
from .modeling_utils import Conv1D, PreTrainedModel
logger = logging.getLogger(__name__)
......@@ -40,14 +35,17 @@ CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-
def angle_defn(pos, i, d_model_size):
angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / d_model_size)
angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
return pos * angle_rates
def positional_encoding(position, d_model_size, dtype):
# create the sinusoidal pattern for the positional encoding
angle_rads = (angle_defn(torch.arange(position, dtype=dtype).unsqueeze(1),
angle_rads = angle_defn(
torch.arange(position, dtype=dtype).unsqueeze(1),
torch.arange(d_model_size, dtype=dtype).unsqueeze(0),
d_model_size))
d_model_size,
)
sines = torch.sin(angle_rads[:, 0::2])
cosines = torch.cos(angle_rads[:, 1::2])
......@@ -55,16 +53,17 @@ def positional_encoding(position, d_model_size, dtype):
pos_encoding = torch.cat([sines, cosines], dim=-1)
return pos_encoding
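A self-contained rerun of the table construction above with toy sizes (position count and model width are made up): even columns carry sines, odd columns cosines, concatenated along the feature axis.
import torch
def angle_defn(pos, i, d_model_size):
    return pos * (1 / torch.pow(10000, (2 * (i // 2)) / d_model_size))
position, d_model_size = 6, 8  # toy sizes
angle_rads = angle_defn(
    torch.arange(position, dtype=torch.float).unsqueeze(1),
    torch.arange(d_model_size, dtype=torch.float).unsqueeze(0),
    d_model_size,
)
pos_encoding = torch.cat([torch.sin(angle_rads[:, 0::2]), torch.cos(angle_rads[:, 1::2])], dim=-1)
print(pos_encoding.shape)  # torch.Size([6, 8])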
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
# calculate attention
matmul_qk = torch.matmul(q, k.permute(0,1,3,2))
matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
dk = k.shape[-1]
scaled_attention_logits = matmul_qk / np.sqrt(dk)
if mask is not None:
nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4)
scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
if attention_mask is not None:
# Apply the attention mask
......@@ -128,11 +127,8 @@ class MultiHeadAttention(torch.nn.Module):
return outputs
def point_wise_feed_forward_network(d_model_size, dff):
return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff),
torch.nn.ReLU(),
torch.nn.Linear(dff, d_model_size))
return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size))
class EncoderLayer(torch.nn.Module):
......@@ -150,10 +146,9 @@ class EncoderLayer(torch.nn.Module):
def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None):
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention(normed, normed, normed, mask,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask)
attn_outputs = self.multi_head_attention(
normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
)
attn_output = attn_outputs[0]
attn_output = self.dropout1(attn_output)
out1 = x + attn_output
......@@ -171,6 +166,7 @@ class CTRLPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = CTRLConfig
pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -244,8 +240,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class CTRLModel(CTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -273,6 +273,7 @@ class CTRLModel(CTRLPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(CTRLModel, self).__init__(config)
self.output_hidden_states = config.output_hidden_states
......@@ -287,11 +288,12 @@ class CTRLModel(CTRLPreTrainedModel):
self.w = nn.Embedding(config.vocab_size, config.n_embd)
self.dropout = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([EncoderLayer(config.n_embd,
config.n_head,
config.dff,
config.resid_pdrop,
config.output_attentions) for _ in range(config.n_layer)])
self.h = nn.ModuleList(
[
EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions)
for _ in range(config.n_layer)
]
)
self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.init_weights()
......@@ -309,7 +311,16 @@ class CTRLModel(CTRLPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -357,8 +368,12 @@ class CTRLModel(CTRLPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
......@@ -391,11 +406,9 @@ class CTRLModel(CTRLPreTrainedModel):
for i, (h, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = h(hidden_states,
mask,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i])
outputs = h(
hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
)
hidden_states, present = outputs[:2]
if self.output_past:
presents = presents + (present,)
......@@ -421,8 +434,12 @@ class CTRLModel(CTRLPreTrainedModel):
return outputs
@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class CTRLLMHeadModel(CTRLPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -463,6 +480,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(CTRLLMHeadModel, self).__init__(config)
self.transformer = CTRLModel(config)
......@@ -473,15 +491,26 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
transformer_outputs = self.transformer(
input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
......@@ -495,8 +524,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
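A toy sketch of the one-token shift applied above (tensor sizes are made up): the logits at position t are scored against the label at position t + 1, so the last logits row and the first label column are dropped.
import torch
from torch.nn import CrossEntropyLoss
batch, seq, vocab = 2, 5, 11  # hypothetical sizes
lm_logits = torch.randn(batch, seq, vocab)
labels = torch.randint(vocab, (batch, seq))
shift_logits = lm_logits[..., :-1, :].contiguous()  # drop last position
shift_labels = labels[..., 1:].contiguous()  # drop first label
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.shape)  # scalar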
......@@ -18,60 +18,53 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import copy
import logging
import math
import copy
import sys
from io import open
import itertools
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings
from .modeling_utils import PreTrainedModel, prune_linear_layer
import logging
logger = logging.getLogger(__name__)
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
"distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
"distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
"distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
"distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
}
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
def gelu(x):
return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
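For reference, a quick numeric check of the erf-based gelu above (the sample points are arbitrary):
import math
import torch
def gelu(x):
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
x = torch.tensor([-1.0, 0.0, 1.0])
print(gelu(x))  # tensor([-0.1587, 0.0000, 0.8413])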
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
for pos in range(n_pos)
])
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
out.requires_grad = False
class Embeddings(nn.Module):
def __init__(self,
config):
def __init__(self, config):
super(Embeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
if config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(n_pos=config.max_position_embeddings,
dim=config.dim,
out=self.position_embeddings.weight)
create_sinusoidal_embeddings(
n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
)
self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
self.dropout = nn.Dropout(config.dropout)
......@@ -100,6 +93,7 @@ class Embeddings(nn.Module):
embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim)
return embeddings
class MultiHeadSelfAttention(nn.Module):
def __init__(self, config):
super(MultiHeadSelfAttention, self).__init__()
......@@ -139,7 +133,7 @@ class MultiHeadSelfAttention(nn.Module):
self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, query, key, value, mask, head_mask = None):
def forward(self, query, key, value, mask, head_mask=None):
"""
Parameters
----------
......@@ -177,9 +171,9 @@ class MultiHeadSelfAttention(nn.Module):
v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length)
mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length)
scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length)
mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length)
weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length)
weights = self.dropout(weights) # (bs, n_heads, q_length, k_length)
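A toy sketch of the masking step above (shapes are made up): key positions where the mask is 0 are filled with -inf, so they get exactly zero weight after the softmax.
import torch
import torch.nn as nn
scores = torch.randn(1, 1, 2, 3)  # (bs, n_heads, q_length, k_length)
mask = torch.tensor([[1, 1, 0]])  # last key position is padding
pad = (mask == 0).view(1, 1, 1, 3).expand_as(scores)
scores = scores.masked_fill(pad, -float("inf"))
weights = nn.Softmax(dim=-1)(scores)
print(weights[0, 0])  # last column is 0.0 in both rows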
......@@ -197,14 +191,17 @@ class MultiHeadSelfAttention(nn.Module):
else:
return (context,)
class FFN(nn.Module):
def __init__(self, config):
super(FFN, self).__init__()
self.dropout = nn.Dropout(p=config.dropout)
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
self.activation = gelu if config.activation == 'gelu' else nn.ReLU()
assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
config.activation
)
self.activation = gelu if config.activation == "gelu" else nn.ReLU()
def forward(self, input):
x = self.lin1(input)
......@@ -213,6 +210,7 @@ class FFN(nn.Module):
x = self.dropout(x)
return x
class TransformerBlock(nn.Module):
def __init__(self, config):
super(TransformerBlock, self).__init__()
......@@ -303,9 +301,7 @@ class Transformer(nn.Module):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module(x=hidden_state,
attn_mask=attn_mask,
head_mask=head_mask[i])
layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i])
hidden_state = layer_outputs[-1]
if self.output_attentions:
......@@ -327,11 +323,12 @@ class Transformer(nn.Module):
return outputs # last-layer hidden state, (all hidden states), (all attentions)
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class DistilBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DistilBertConfig
pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = None
......@@ -396,8 +393,12 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertModel(DistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -420,6 +421,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(DistilBertModel, self).__init__(config)
......@@ -442,8 +444,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.transformer.layer[layer].attention.prune_heads(heads)
def forward(self,
input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -468,24 +469,29 @@ class DistilBertModel(DistilBertPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim)
tfmr_output = self.transformer(x=inputs_embeds,
attn_mask=attention_mask,
head_mask=head_mask)
tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask)
hidden_state = tfmr_output[0]
output = (hidden_state, ) + tfmr_output[1:]
output = (hidden_state,) + tfmr_output[1:]
return output # last-layer hidden-state, (all hidden_states), (all attentions)
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -516,6 +522,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForMaskedLM, self).__init__(config)
self.output_attentions = config.output_attentions
......@@ -534,28 +541,31 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
dlbrt_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
dlbrt_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
outputs = (prediction_logits, ) + dlbrt_output[1:]
outputs = (prediction_logits,) + dlbrt_output[1:]
if masked_lm_labels is not None:
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)),
masked_lm_labels.view(-1))
mlm_loss = self.mlm_loss_fct(
prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
)
outputs = (mlm_loss,) + outputs
return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -587,6 +597,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -599,10 +610,9 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
distilbert_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
......@@ -623,9 +633,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -663,6 +676,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(DistilBertForQuestionAnswering, self).__init__(config)
......@@ -673,11 +687,18 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
):
distilbert_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
......@@ -707,10 +728,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -740,6 +763,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
loss, scores = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -750,13 +774,11 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, head_mask=None,
inputs_embeds=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.distilbert(input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
outputs = self.distilbert(
input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
sequence_output = outputs[0]
......
......@@ -18,14 +18,13 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import warnings
import torch
from torch import nn
from tqdm import trange
from .modeling_auto import AutoModel, AutoModelWithLMHead
logger = logging.getLogger(__name__)
......@@ -145,16 +144,12 @@ class PreTrainedEncoderDecoder(nn.Module):
# by the value of the flag `is_decoder` that we need to set correctly.
encoder = kwargs_encoder.pop("model", None)
if encoder is None:
encoder = AutoModel.from_pretrained(
encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
)
encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
encoder.config.is_decoder = False
decoder = kwargs_decoder.pop("model", None)
if decoder is None:
decoder = AutoModelWithLMHead.from_pretrained(
decoder_pretrained_model_name_or_path, **kwargs_decoder
)
decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
decoder.config.is_decoder = True
model = cls(encoder, decoder)
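A hypothetical usage sketch of the flag handling above (the checkpoint name and the package-root export are assumptions, not tested calls): both halves load by name, and only the decoder ends up flagged as a decoder.
from transformers import PreTrainedEncoderDecoder  # assumed export path
# placeholder checkpoint names; any encoder/decoder pair with configs works
model = PreTrainedEncoderDecoder.from_pretrained("bert-base-uncased", "bert-base-uncased")
assert model.encoder.config.is_decoder is False
assert model.decoder.config.is_decoder is True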
......@@ -173,13 +168,18 @@ class PreTrainedEncoderDecoder(nn.Module):
os.mkdir(save_directory)
# Check whether the output directory is empty or not
sub_directories = [directory for directory in os.listdir(save_directory)
if os.path.isdir(os.path.join(save_directory, directory))]
sub_directories = [
directory
for directory in os.listdir(save_directory)
if os.path.isdir(os.path.join(save_directory, directory))
]
if len(sub_directories) > 0:
if "encoder" in sub_directories and "decoder" in sub_directories:
print("WARNING: there is an older version of encoder-decoder saved in" +\
" the output directory. The default behaviour is to overwrite them.")
print(
"WARNING: there is an older version of encoder-decoder saved in"
+ " the output directory. The default behaviour is to overwrite them."
)
# Empty the output directory
for directory_to_remove in sub_directories:
......@@ -190,7 +190,7 @@ class PreTrainedEncoderDecoder(nn.Module):
# Remove the subdirectory itself
os.rmdir(os.path.join(save_directory, directory_to_remove))
assert(len(os.listdir(save_directory)) == 0) # sanity check
assert len(os.listdir(save_directory)) == 0 # sanity check
# Create the "encoder" directory inside the output directory and save the encoder into it
if not os.path.exists(os.path.join(save_directory, "encoder")):
......
......@@ -17,41 +17,41 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings
from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
logger = logging.getLogger(__name__)
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",
}
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(gpt2_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
......@@ -67,24 +67,24 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
for name, array in zip(names, arrays):
name = name[6:] # skip "model/"
name = name.split('/')
name = name.split("/")
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'w' or l[0] == 'g':
pointer = getattr(pointer, 'weight')
elif l[0] == 'b':
pointer = getattr(pointer, 'bias')
elif l[0] == 'wpe' or l[0] == 'wte':
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, 'weight')
scope_names = [m_name]
if scope_names[0] == "w" or scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "wpe" or scope_names[0] == "wte":
pointer = getattr(pointer, scope_names[0])
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
try:
assert pointer.shape == array.shape
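A toy rerun of the name parsing above (the sample scope names are invented): elements like "h3" split into a prefix and a layer index used to walk the module tree, while names without digits pass through whole.
import re
for m_name in ["h3", "wte", "c_attn"]:  # invented sample scope elements
    if re.fullmatch(r"[A-Za-z]+\d+", m_name):
        scope_names = re.split(r"(\d+)", m_name)
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)
# h3 -> ['h', '3', '']   wte -> ['wte']   c_attn -> ['c_attn']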
......@@ -130,7 +130,7 @@ class Attention(nn.Module):
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
......@@ -146,7 +146,7 @@ class Attention(nn.Module):
if self.scale:
w = w / math.sqrt(v.size(-1))
nd, ns = w.size(-2), w.size(-1)
b = self.bias[:, :, ns-nd:ns, :ns]
b = self.bias[:, :, ns - nd : ns, :ns]
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
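A toy sketch of the bias slicing a few lines above (context size and offsets are made up): with cached keys, only the last nd query rows of the precomputed triangular bias are needed.
import torch
n_ctx = 6  # hypothetical maximum context
bias = torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)
nd, ns = 2, 5  # 2 fresh queries attending over 5 keys total
b = bias[:, :, ns - nd : ns, :ns]
print(b[0, 0])
# tensor([[1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1.]])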
......@@ -226,10 +226,9 @@ class Block(nn.Module):
self.mlp = MLP(4 * nx, config)
def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
output_attn = self.attn(self.ln_1(x),
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask)
output_attn = self.attn(
self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
)
a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a
......@@ -244,6 +243,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = GPT2Config
pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_gpt2
......@@ -321,8 +321,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -350,6 +354,7 @@ class GPT2Model(GPT2PreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(GPT2Model, self).__init__(config)
self.output_hidden_states = config.output_hidden_states
......@@ -377,7 +382,16 @@ class GPT2Model(GPT2PreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -430,8 +444,12 @@ class GPT2Model(GPT2PreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
......@@ -454,10 +472,9 @@ class GPT2Model(GPT2PreTrainedModel):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i])
outputs = block(
hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
)
hidden_states, present = outputs[:2]
if self.output_past:
......@@ -486,8 +503,12 @@ class GPT2Model(GPT2PreTrainedModel):
return outputs # last hidden state, (presents), (all hidden_states), (attentions)
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -528,6 +549,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(GPT2LMHeadModel, self).__init__(config)
self.transformer = GPT2Model(config)
......@@ -538,15 +560,26 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
transformer_outputs = self.transformer(
input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -558,18 +591,21 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
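# Illustrative sketch, separate from the model code above (toy sizes assumed):
# the one-token shift scores the logits at position t against the label at
# position t + 1, which is what next-token language modeling requires.
import torch
from torch.nn import CrossEntropyLoss

batch, seq, vocab = 2, 5, 11
lm_logits = torch.randn(batch, seq, vocab)
labels = torch.randint(vocab, (batch, seq))

shift_logits = lm_logits[..., :-1, :].contiguous()  # drop the last position
shift_labels = labels[..., 1:].contiguous()         # drop the first token
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.shape)  # a scalar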
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
""",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
......@@ -632,6 +668,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(GPT2DoubleHeadsModel, self).__init__(config)
config.num_labels = 1
......@@ -644,15 +681,28 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
mc_token_ids=None, lm_labels=None, mc_labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
lm_labels=None,
mc_labels=None,
):
transformer_outputs = self.transformer(
input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
......@@ -662,15 +712,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
mc_labels.view(-1))
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
outputs = (loss,) + outputs
if lm_labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
......@@ -15,8 +15,7 @@
# limitations under the License.
"""PyTorch MMBT model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
......@@ -26,12 +25,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__)
class ModalEmbeddings(nn.Module):
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
"""
def __init__(self, config, encoder, embeddings):
super(ModalEmbeddings, self).__init__()
self.config = config
......@@ -62,7 +63,9 @@ class ModalEmbeddings(nn.Module):
position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
if token_type_ids is None:
token_type_ids = torch.zeros((input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device)
token_type_ids = torch.zeros(
(input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
......@@ -140,8 +143,12 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
"""
@add_start_docstrings("The bare MMBT Model outputting raw hidden-states without any specific head on top.",
MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare MMBT Model outputting raw hidden-states without any specific head on top.",
MMBT_START_DOCSTRING,
MMBT_INPUTS_DOCSTRING,
)
class MMBTModel(nn.Module):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -167,19 +174,29 @@ class MMBTModel(nn.Module):
encoder = ImageEncoder(args)
mmbt = MMBTModel(config, transformer, encoder)
"""
def __init__(self, config, transformer, encoder):
super(MMBTModel, self).__init__()
self.config = config
self.transformer = transformer
self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
def forward(self, input_modal, input_ids=None, modal_start_tokens=None,
modal_end_tokens=None, attention_mask=None,
token_type_ids=None, modal_token_type_ids=None,
position_ids=None, modal_position_ids=None, head_mask=None,
inputs_embeds=None, encoder_hidden_states=None,
encoder_attention_mask=None):
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
......@@ -192,21 +209,22 @@ class MMBTModel(nn.Module):
device = input_ids.device if input_ids is not None else inputs_embeds.device
modal_embeddings = self.modal_encoder(input_modal,
modal_embeddings = self.modal_encoder(
input_modal,
start_token=modal_start_tokens,
end_token=modal_end_tokens,
position_ids=modal_position_ids,
token_type_ids=modal_token_type_ids)
token_type_ids=modal_token_type_ids,
)
input_modal_shape = modal_embeddings.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
txt_embeddings = self.transformer.embeddings(input_ids=input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds)
txt_embeddings = self.transformer.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
......@@ -215,12 +233,16 @@ class MMBTModel(nn.Module):
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
else:
attention_mask = torch.cat([torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1)
attention_mask = torch.cat(
[torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(input_shape, device=device)
else:
encoder_attention_mask = torch.cat([torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1)
encoder_attention_mask = torch.cat(
[torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
)
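# Illustrative sketch (toy sizes assumed): the user-supplied text mask is
# prepended with ones covering the modal embeddings, so the joint
# [modal | text] sequence is masked consistently.
import torch

modal_len = 3
attention_mask = torch.tensor([[1, 1, 1, 1, 0]])  # text-only mask, shape (1, 5)
full_mask = torch.cat(
    [torch.ones(1, modal_len, dtype=torch.long), attention_mask], dim=1
)
print(full_mask)  # tensor([[1, 1, 1, 1, 1, 1, 1, 0]]), shape (1, 8)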
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
......@@ -254,7 +276,9 @@ class MMBTModel(nn.Module):
if encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
encoder_extended_attention_mask = encoder_extended_attention_mask.to(
dtype=next(self.parameters()).dtype
) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
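# Illustrative sketch (values assumed): the 1/0 keep-mask becomes an additive
# bias, 0 where attention is allowed and -10000 where it is masked, so the
# softmax drives masked positions to near-zero probability.
import torch

keep = torch.tensor([[1.0, 1.0, 0.0]])[:, None, None, :]  # (batch, 1, 1, seq)
additive = (1.0 - keep) * -10000.0                        # [[0, 0, -10000]]
scores = torch.zeros(1, 1, 1, 3) + additive
print(torch.softmax(scores, dim=-1))  # ~[[0.5, 0.5, 0.0]]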
# Prepare head mask if needed
......@@ -267,25 +291,31 @@ class MMBTModel(nn.Module):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
encoder_outputs = self.transformer.encoder(embedding_output,
encoder_outputs = self.transformer.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask)
encoder_attention_mask=encoder_extended_attention_mask,
)
sequence_output = encoder_outputs[0]
pooled_output = self.transformer.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -293,8 +323,12 @@ class MMBTModel(nn.Module):
self.embeddings.word_embeddings = value
@add_start_docstrings("""MMBT Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""MMBT Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output)""",
MMBT_START_DOCSTRING,
MMBT_INPUTS_DOCSTRING,
)
class MMBTForClassification(nn.Module):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -333,11 +367,25 @@ class MMBTForClassification(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None,
attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None,
modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.mmbt(input_modal=input_modal, input_ids=input_ids,
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.mmbt(
input_modal=input_modal,
input_ids=input_ids,
modal_start_tokens=modal_start_tokens,
modal_end_tokens=modal_end_tokens,
attention_mask=attention_mask,
......@@ -346,7 +394,8 @@ class MMBTForClassification(nn.Module):
position_ids=position_ids,
modal_position_ids=modal_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......
......@@ -17,26 +17,26 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
logger = logging.getLogger(__name__)
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"
}
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
......@@ -45,17 +45,17 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
import re
import numpy as np
if '.ckpt' in openai_checkpoint_folder_path:
if ".ckpt" in openai_checkpoint_folder_path:
openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle:
names = json.load(names_handle)
with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle:
shapes = json.load(shapes_handle)
offsets = np.cumsum([np.prod(shape) for shape in shapes])
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)]
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
......@@ -83,23 +83,23 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
name = name[6:] # skip "model/"
assert name[-2:] == ":0"
name = name[:-2]
name = name.split('/')
name = name.split("/")
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'g':
pointer = getattr(pointer, 'weight')
elif l[0] == 'b':
pointer = getattr(pointer, 'bias')
elif l[0] == 'w':
pointer = getattr(pointer, 'weight')
scope_names = [m_name]
if scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "w":
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
try:
assert pointer.shape == array.shape
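# Illustrative sketch (example scope names assumed): how a TF scope segment
# with a trailing number is split so the digits can index into a ModuleList
# on the PyTorch side.
import re

for m_name in ["g", "w", "h0", "block12"]:
    if re.fullmatch(r"[A-Za-z]+\d+", m_name):
        scope_names = re.split(r"(\d+)", m_name)  # "h0" -> ["h", "0", ""]
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)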
......@@ -156,7 +156,7 @@ class Attention(nn.Module):
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
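# Illustrative sketch (toy sizes assumed: 2 heads of size 2, so the fused qkv
# split_size is 4): keeping head 0 keeps its columns in each of the q, k and v
# thirds of the fused c_attn weight.
import torch

n_heads, head_size = 2, 2
split_size = n_heads * head_size
mask = torch.tensor([1, 1, 0, 0])             # keep head 0, prune head 1
index = torch.arange(split_size)[mask.eq(1)]  # tensor([0, 1])
index_attn = torch.cat([index, index + split_size, index + 2 * split_size])
print(index_attn)                             # tensor([0, 1, 4, 5, 8, 9])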
......@@ -172,7 +172,7 @@ class Attention(nn.Module):
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + - 1e4 * (1 - b)
w = w * b + -1e4 * (1 - b)
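# Illustrative sketch (toy sizes assumed): b is a cropped lower-triangular 1/0
# matrix, so the masking line leaves scores for positions j <= i untouched and
# pushes future positions to -1e4 before the softmax.
import torch

qlen = klen = 4
w = torch.zeros(1, 1, qlen, klen)  # stand-in attention scores
b = torch.tril(torch.ones(qlen, klen)).view(1, 1, qlen, klen)
w = w * b + -1e4 * (1 - b)
print(torch.softmax(w, dim=-1)[0, 0])  # row i spreads mass over columns <= i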
if attention_mask is not None:
# Apply the attention mask
......@@ -261,6 +261,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = OpenAIGPTConfig
pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_openai_gpt
......@@ -330,8 +331,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -354,6 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(OpenAIGPTModel, self).__init__(config)
self.output_attentions = config.output_attentions
......@@ -379,7 +385,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -422,8 +436,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
......@@ -463,8 +481,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
return outputs # last hidden state, (all hidden states), (all attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -496,6 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(OpenAIGPTLMHeadModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config)
......@@ -506,14 +529,24 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -524,18 +557,21 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, (all hidden states), (all attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
""",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
......@@ -587,6 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
......@@ -600,14 +637,26 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
mc_token_ids=None, lm_labels=None, mc_labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
lm_labels=None,
mc_labels=None,
):
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -616,15 +665,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
mc_labels.view(-1))
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
outputs = (loss,) + outputs
if lm_labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
......@@ -15,8 +15,7 @@
# limitations under the License.
"""PyTorch RoBERTa model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
......@@ -24,31 +23,35 @@ import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
logger = logging.getLogger(__name__)
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
"roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
"roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
"roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
"distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
"roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
"roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
}
class RobertaEmbeddings(BertEmbeddings):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super(RobertaEmbeddings, self).__init__(config)
self.padding_idx = 1
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
padding_idx=self.padding_idx)
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if position_ids is None:
......@@ -58,10 +61,9 @@ class RobertaEmbeddings(BertEmbeddings):
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
return super(RobertaEmbeddings, self).forward(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds)
return super(RobertaEmbeddings, self).forward(
input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
)
def create_position_ids_from_input_ids(self, x):
""" Replace non-padding symbols with their position numbers. Position numbers begin at
......@@ -85,8 +87,9 @@ class RobertaEmbeddings(BertEmbeddings):
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long,
device=inputs_embeds.device)
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
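# Illustrative sketch (toy shapes assumed): with padding_idx = 1, generated
# position ids start at 2, matching fairseq's convention of reserving the
# first positions for padding.
import torch

padding_idx = 1
inputs_embeds = torch.zeros(2, 4, 8)  # (batch, seq, hidden)
seq_len = inputs_embeds.size(1)
position_ids = torch.arange(padding_idx + 1, seq_len + padding_idx + 1, dtype=torch.long)
print(position_ids)                                        # tensor([2, 3, 4, 5])
print(position_ids.unsqueeze(0).expand(2, seq_len).shape)  # torch.Size([2, 4])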
......@@ -162,8 +165,12 @@ ROBERTA_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaModel(BertModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -209,8 +216,10 @@ class RobertaModel(BertModel):
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@add_start_docstrings(
"""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING
)
class RobertaForMaskedLM(BertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -256,14 +265,24 @@ class RobertaForMaskedLM(BertPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
......@@ -299,9 +318,12 @@ class RobertaLMHead(nn.Module):
return x
@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
@add_start_docstrings(
"""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
on top of the pooled output) e.g. for GLUE tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -344,14 +366,24 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.roberta = RobertaModel(config)
self.classifier = RobertaClassificationHead(config)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
......@@ -369,9 +401,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""Roberta Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
r"""
Inputs:
......@@ -455,16 +490,29 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
token_type_ids=None,
attention_mask=None,
labels=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask, head_mask=head_mask)
outputs = self.roberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
......@@ -481,9 +529,12 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""Roberta Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -527,15 +578,25 @@ class RobertaForTokenClassification(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
......@@ -577,9 +638,12 @@ class RobertaClassificationHead(nn.Module):
return x
@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -626,14 +690,24 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
start_positions=None, end_positions=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
start_positions=None,
end_positions=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
head_mask=head_mask,
)
sequence_output = outputs[0]
......
......@@ -16,23 +16,21 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import copy
import itertools
import logging
import math
import os
import sys
import copy
import itertools
from io import open
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch import nn
from torch.nn import CrossEntropyLoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_t5 import T5Config
from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings
from .modeling_utils import PreTrainedModel, prune_linear_layer
logger = logging.getLogger(__name__)
......@@ -41,13 +39,14 @@ logger = logging.getLogger(__name__)
# for the pretrained weights provided with the models
####################################################
T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
"t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
"t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
"t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
"t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
"t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
}
####################################################
# This is a conversion method from TF 1.0 to PyTorch
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
......@@ -60,8 +59,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
......@@ -76,44 +77,44 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
tf_weights[name] = array
for txt_name in names:
name = txt_name.split('/')
name = txt_name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
logger.info("Skipping {}".format("/".join(name)))
tf_weights.pop(txt_name, None)
continue
if '_slot_' in name[-1]:
if "_slot_" in name[-1]:
logger.info("Skipping {}".format("/".join(name)))
tf_weights.pop(txt_name, None)
continue
pointer = model
array = tf_weights[txt_name]
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
l = [m_name]
if l[0] in ['kernel', 'scale', 'embedding']:
pointer = getattr(pointer, 'weight')
# elif l[0] == 'scale':
scope_names = [m_name]
if scope_names[0] in ["kernel", "scale", "embedding"]:
pointer = getattr(pointer, "weight")
# elif scope_names[0] == 'scale':
# pointer = getattr(pointer, 'weight')
# elif l[0] == 'output_bias' or l[0] == 'beta':
# elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
# pointer = getattr(pointer, 'bias')
# elif l[0] == 'squad':
# elif scope_names[0] == 'squad':
# pointer = getattr(pointer, 'classifier')
else:
try:
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if l[0] not in ['kernel', 'scale', 'embedding']:
pointer = getattr(pointer, 'weight')
if l[0] != 'embedding':
if scope_names[0] not in ["kernel", "scale", "embedding"]:
pointer = getattr(pointer, "weight")
if scope_names[0] != "embedding":
logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
array = np.transpose(array)
try:
......@@ -125,7 +126,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
pointer.data = torch.from_numpy(array.astype(np.float32))
tf_weights.pop(txt_name, None)
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
# logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model
......@@ -136,6 +137,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
####################################################
class T5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
""" Construct a layernorm module in the T5 style
......@@ -228,10 +230,7 @@ class T5Attention(nn.Module):
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position,
bidirectional=True,
num_buckets=32,
max_distance=128):
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
......@@ -267,12 +266,12 @@ class T5Attention(nn.Module):
# half of the buckets are for exact increments in positions
max_exact = num_buckets // 2
is_small = (n < max_exact)
is_small = n < max_exact
# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
val_if_large = max_exact + (
torch.log(n.float() / max_exact)
/ math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
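# Illustrative sketch (unidirectional case assumed, num_buckets=32,
# max_distance=128): distances below 16 get their own exact bucket, while
# larger distances share logarithmically spaced buckets up to bucket 31.
import math
import torch

num_buckets, max_distance = 32, 128
n = torch.arange(0, 200)  # relative distances
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
    torch.log(n.float().clamp(min=1) / max_exact)  # clamp guards log(0); those entries are overridden below
    / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
buckets = torch.where(is_small, n, val_if_large)
print(buckets[[0, 15, 16, 64, 199]])  # tensor([ 0, 15, 16, 26, 31])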
......@@ -283,9 +282,11 @@ class T5Attention(nn.Module):
context_position = torch.arange(qlen, dtype=torch.long)[:, None]
memory_position = torch.arange(klen, dtype=torch.long)[None, :]
relative_position = memory_position - context_position # shape (qlen, klen)
rp_bucket = self._relative_position_bucket(relative_position, # shape (qlen, klen)
rp_bucket = self._relative_position_bucket(
relative_position, # shape (qlen, klen)
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets)
num_buckets=self.relative_attention_num_buckets,
)
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
return values
......@@ -298,7 +299,7 @@ class T5Attention(nn.Module):
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = input.size()
if kv is None:
klen = qlen if cache is None else cache['slen'] + qlen
klen = qlen if cache is None else cache["slen"] + qlen
else:
klen = kv.size(1)
......@@ -330,7 +331,7 @@ class T5Attention(nn.Module):
cache[self.layer_id] = (k, v)
# q = q / math.sqrt(dim_per_head) # No scaling in T5
scores = torch.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen)
scores = torch.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen)
if position_bias is None:
if not self.has_relative_attention_bias:
......@@ -369,10 +370,9 @@ class T5LayerSelfAttention(nn.Module):
def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
norm_x = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(norm_x,
mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask)
attention_output = self.SelfAttention(
norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask
)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
......@@ -388,11 +388,9 @@ class T5LayerCrossAttention(nn.Module):
def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
norm_x = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(norm_x,
mask=attention_mask,
kv=kv,
position_bias=position_bias,
head_mask=head_mask)
attention_output = self.EncDecAttention(
norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask
)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
......@@ -411,26 +409,36 @@ class T5Block(nn.Module):
else:
self.layer.append(T5LayerFF(config))
def forward(self, hidden_states, attention_mask=None, position_bias=None,
encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
head_mask=None):
self_attention_outputs = self.layer[0](hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
head_mask=None,
):
self_attention_outputs = self.layer[0](
hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask
)
hidden_states = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights
if not self.is_decoder:
hidden_states = self.layer[1](hidden_states)
else:
cross_attention_outputs = self.layer[1](hidden_states,
cross_attention_outputs = self.layer[1](
hidden_states,
kv=encoder_hidden_states,
attention_mask=encoder_attention_mask,
position_bias=encoder_decoder_position_bias,
head_mask=head_mask)
head_mask=head_mask,
)
hidden_states = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:] # Keep cross-attention outputs and relative position weights
outputs = (
outputs + cross_attention_outputs[1:]
) # Keep cross-attention outputs and relative position weights
hidden_states = self.layer[2](hidden_states)
outputs = (hidden_states,) + outputs # add attentions if we output them
......@@ -441,6 +449,7 @@ class T5PreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = T5Config
pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_t5
......@@ -450,29 +459,31 @@ class T5PreTrainedModel(PreTrainedModel):
def dummy_inputs(self):
input_ids = torch.tensor(DUMMY_INPUTS)
input_mask = torch.tensor(DUMMY_MASK)
dummy_inputs = {'decoder_input_ids': input_ids,
'encoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
dummy_inputs = {
"decoder_input_ids": input_ids,
"encoder_input_ids": input_ids,
"decoder_attention_mask": input_mask,
}
return dummy_inputs
def _init_weights(self, module):
""" Initialize the weights """
factor = self.config.initializer_factor # Used for testing weights initialization
if isinstance(module, T5LayerNorm):
module.weight.data.fill_(factor*1.0)
module.weight.data.fill_(factor * 1.0)
elif isinstance(module, (T5Model, T5WithLMHeadModel)):
# Mesh TensorFlow embeddings initialization
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
elif isinstance(module, T5DenseReluDense):
# Mesh TensorFlow FF initialization
# See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
# and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
if hasattr(module.wi, 'bias') and module.wi.bias is not None:
module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
if hasattr(module.wi, "bias") and module.wi.bias is not None:
module.wi.bias.data.zero_()
module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
if hasattr(module.wo, 'bias') and module.wo.bias is not None:
module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
if hasattr(module.wo, "bias") and module.wo.bias is not None:
module.wo.bias.data.zero_()
elif isinstance(module, T5Attention):
# Mesh TensorFlow attention initialization to avoid scaling before softmax
......@@ -480,12 +491,12 @@ class T5PreTrainedModel(PreTrainedModel):
d_model = self.config.d_model
d_kv = self.config.d_kv
n_heads = self.config.num_heads
module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5))
module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5))
if module.has_relative_attention_bias:
module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
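# Illustrative arithmetic (dims assumed from t5-small: d_model=512, d_kv=64,
# num_heads=8, factor=1.0) for the Mesh-TF-style stds used above:
d_model, d_kv, n_heads, factor = 512, 64, 8, 1.0
print(factor * ((d_model * d_kv) ** -0.5))  # q:    ~0.0055
print(factor * (d_model ** -0.5))           # k, v: ~0.0442
print(factor * ((n_heads * d_kv) ** -0.5))  # o:    ~0.0442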
class T5Stack(T5PreTrainedModel):
......@@ -495,19 +506,22 @@ class T5Stack(T5PreTrainedModel):
self.output_hidden_states = config.output_hidden_states
self.is_decoder = config.is_decoder
self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
for i in range(config.num_layers)])
self.block = nn.ModuleList(
[T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.init_weights()
def forward(self,
def forward(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None):
head_mask=None,
):
batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
if attention_mask is None:
......@@ -557,7 +571,9 @@ class T5Stack(T5PreTrainedModel):
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
encoder_extended_attention_mask = encoder_extended_attention_mask.to(
dtype=next(self.parameters()).dtype
) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
else:
encoder_extended_attention_mask = None
......@@ -572,8 +588,12 @@ class T5Stack(T5PreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_layers
......@@ -587,13 +607,15 @@ class T5Stack(T5PreTrainedModel):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(hidden_states,
layer_outputs = layer_module(
hidden_states,
attention_mask=extended_attention_mask,
position_bias=position_bias,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
encoder_decoder_position_bias=encoder_decoder_position_bias,
head_mask=head_mask[i])
head_mask=head_mask[i],
)
# layer_outputs is a tuple with:
# hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
hidden_states = layer_outputs[0]
......@@ -672,9 +694,12 @@ T5_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
"without any specific head on top.",
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.",
T5_START_DOCSTRING,
T5_INPUTS_DOCSTRING,
)
class T5Model(T5PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -697,6 +722,7 @@ class T5Model(T5PreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(T5Model, self).__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
......@@ -729,12 +755,13 @@ class T5Model(T5PreTrainedModel):
# `encoder_`), decoder-specific (prefixed by `decoder_`) and those
# that apply to the model as whole.
# We let the specific kwargs override the common ones in case of conflict.
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_common = dict(
(k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
)
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
......@@ -770,8 +797,7 @@ class T5Model(T5PreTrainedModel):
return decoder_outputs + encoder_outputs
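# Illustrative sketch (example kwargs assumed) of the prefix routing used in
# forward above: un-prefixed kwargs go to both stacks, while "encoder_" and
# "decoder_" prefixed kwargs are stripped of their prefix and override per stack.
kwargs = {
    "attention_mask": "common-mask",
    "encoder_input_ids": "enc-ids",
    "decoder_input_ids": "dec-ids",
}
kwargs_common = {
    k: v for k, v in kwargs.items()
    if not k.startswith("encoder_") and not k.startswith("decoder_")
}
kwargs_encoder = dict(kwargs_common)
kwargs_decoder = dict(kwargs_common)
kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})
print(kwargs_encoder)  # {'attention_mask': 'common-mask', 'input_ids': 'enc-ids'}
print(kwargs_decoder)  # {'attention_mask': 'common-mask', 'input_ids': 'dec-ids'}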
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5WithLMHeadModel(T5PreTrainedModel):
r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -802,6 +828,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(T5WithLMHeadModel, self).__init__(config)
self.model_dim = config.d_model
......@@ -834,14 +861,15 @@ class T5WithLMHeadModel(T5PreTrainedModel):
# that apply to the model as whole.
# We let the specific kwargs override the common ones in case of conflict.
lm_labels = kwargs.pop('decoder_lm_labels', None)
lm_labels = kwargs.pop("decoder_lm_labels", None)
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_common = dict(
(k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
)
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
......@@ -879,8 +907,9 @@ class T5WithLMHeadModel(T5PreTrainedModel):
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
decoder_outputs = (loss,) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
decoder_outputs = (
loss,
) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
return decoder_outputs + encoder_outputs
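# A worked sketch (standalone, illustrative shapes) of the shifted loss computed
# above: the logit at position t is scored against the label at position t + 1,
# so the last logit and the first label are dropped before the flattened
# cross-entropy.
import torch
from torch.nn import CrossEntropyLoss

lm_logits = torch.randn(2, 5, 32)          # (batch, seq_len, vocab)
lm_labels = torch.randint(0, 32, (2, 5))   # (batch, seq_len)
shift_logits = lm_logits[..., :-1, :].contiguous()  # keep positions 0..3
shift_labels = lm_labels[..., 1:].contiguous()      # keep positions 1..4
loss = CrossEntropyLoss(ignore_index=-1)(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)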
......@@ -22,23 +22,22 @@ import sys
import tensorflow as tf
from .configuration_albert import AlbertConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .file_utils import add_start_docstrings
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
import logging
logger = logging.getLogger(__name__)
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
"albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
"albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
"albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
"albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
"albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
"albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
"albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
"albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
}
......@@ -50,21 +49,22 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
super(TFAlbertEmbeddings, self).__init__(**kwargs)
self.config = config
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.embedding_size,
embeddings_initializer=get_initializer(
self.config.initializer_range),
name='position_embeddings')
self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
embeddings_initializer=get_initializer(self.config.initializer_range),
name="position_embeddings",
)
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size,
config.embedding_size,
embeddings_initializer=get_initializer(
self.config.initializer_range),
name='token_type_embeddings')
embeddings_initializer=get_initializer(self.config.initializer_range),
name="token_type_embeddings",
)
# self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
......@@ -75,7 +75,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
self.word_embeddings = self.add_weight(
"weight",
shape=[self.config.vocab_size, self.config.embedding_size],
initializer=get_initializer(self.config.initializer_range))
initializer=get_initializer(self.config.initializer_range),
)
super(TFAlbertEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding", training=False):
......@@ -145,34 +146,29 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(
config.hidden_size / config.num_attention_heads)
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='query')
self.key = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='key')
self.value = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='value')
self.dropout = tf.keras.layers.Dropout(
config.attention_probs_dropout_prob)
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x, batch_size):
x = tf.reshape(
x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
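# Shape flow for transpose_for_scores with illustrative sizes (batch=8, seq=128,
# 12 heads of 64 dims): (8, 128, 768) --reshape--> (8, 128, 12, 64)
# --transpose(0, 2, 1, 3)--> (8, 12, 128, 64), i.e. one attention map per head.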
def call(self, inputs, training=False):
......@@ -212,23 +208,21 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if self.output_attentions else (
context_layer,)
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
return outputs
class TFAlbertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertSelfOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
......@@ -245,12 +239,10 @@ class TFAlbertAttention(TFBertSelfAttention):
super(TFAlbertAttention, self).__init__(config, **kwargs)
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.pruned_heads = set()
def prune_heads(self, heads):
......@@ -293,11 +285,11 @@ class TFAlbertAttention(TFBertSelfAttention):
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
self_outputs = (context_layer, attention_probs) if self.output_attentions else (
context_layer,)
self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
hidden_states = self_outputs[0]
......@@ -313,34 +305,37 @@ class TFAlbertAttention(TFBertSelfAttention):
class TFAlbertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertLayer, self).__init__(**kwargs)
self.attention = TFAlbertAttention(config, name='attention')
self.attention = TFAlbertAttention(config, name="attention")
self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
config.initializer_range), name='ffn')
self.ffn = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), name='ffn_output')
self.ffn_output = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
)
self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
attention_outputs = self.attention(
[hidden_states, attention_mask, head_mask], training=training)
attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
ffn_output = self.ffn(attention_outputs[0])
ffn_output = self.activation(ffn_output)
ffn_output = self.ffn_output(ffn_output)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.full_layer_layer_norm(
ffn_output + attention_outputs[0])
hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0])
# add attentions if we output them
outputs = (hidden_states,) + attention_outputs[1:]
......@@ -353,8 +348,9 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format(
i)) for i in range(config.inner_group_num)]
self.albert_layers = [
TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num)
]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -363,8 +359,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
layer_attentions = ()
for layer_index, albert_layer in enumerate(self.albert_layers):
layer_output = albert_layer(
[hidden_states, attention_mask, head_mask[layer_index]], training=training)
layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training)
hidden_states = layer_output[0]
if self.output_attentions:
......@@ -389,10 +384,15 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
self.config = config
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), name='embedding_hidden_mapping_in')
self.albert_layer_groups = [TFAlbertLayerGroup(
config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)]
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embedding_hidden_mapping_in",
)
self.albert_layer_groups = [
TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i))
for i in range(config.num_hidden_groups)
]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -405,15 +405,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
for i in range(self.config.num_hidden_layers):
# Number of layers in a hidden group
layers_per_group = int(
self.config.num_hidden_layers / self.config.num_hidden_groups)
layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
# Index of the hidden group
group_idx = int(
i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
layer_group_output = self.albert_layer_groups[group_idx](
[hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training)
[
hidden_states,
attention_mask,
head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
],
training=training,
)
hidden_states = layer_group_output[0]
if self.output_attentions:
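# Worked example of the ALBERT group indexing above, assuming an illustrative
# config with num_hidden_layers=12 and num_hidden_groups=3:
#   layers_per_group = 12 // 3 = 4
#   group_idx for layers 0..11 -> [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
# so each group's weights are reused layers_per_group times; this is how ALBERT
# shares parameters across depth.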
......@@ -436,6 +440,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "albert"
......@@ -446,31 +451,27 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
super(TFAlbertMLMHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dense = tf.keras.layers.Dense(config.embedding_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.dense = tf.keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.decoder_bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='decoder/bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.decoder_bias = self.add_weight(
shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
super(TFAlbertMLMHead, self).build(input_shape)
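# A minimal sketch (standalone helper, not a method; names illustrative) of the
# weight tying noted above: the decoder reuses the embedding matrix E as the
# output projection, so logits = hidden @ E^T + bias, and only the per-token
# biases are new parameters.
import tensorflow as tf

def tied_mlm_logits(hidden_states, embedding_matrix, bias):
    # hidden_states: (batch, seq, embed); embedding_matrix: (vocab, embed)
    batch_size, seq_len = tf.shape(hidden_states)[0], tf.shape(hidden_states)[1]
    x = tf.reshape(hidden_states, [-1, tf.shape(embedding_matrix)[1]])
    logits = tf.matmul(x, embedding_matrix, transpose_b=True) + bias
    return tf.reshape(logits, [batch_size, seq_len, -1])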
def call(self, hidden_states):
......@@ -560,8 +561,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -601,8 +606,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder")
self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), activation='tanh', name='pooler')
self.pooler = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="pooler",
)
def get_input_embeddings(self):
return self.embeddings
......@@ -617,7 +626,16 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -627,12 +645,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
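# Usage sketch of the three input formats accepted above (tensors illustrative):
#   model(input_ids)                                                   # single tensor
#   model([input_ids, attention_mask])                                 # positional list
#   model({"input_ids": input_ids, "attention_mask": attention_mask})  # dict of named inputs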
......@@ -672,16 +690,14 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
embedding_output = self.embeddings(
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder(
[embedding_output, extended_attention_mask, head_mask], training=training)
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output[:, 0])
......@@ -692,8 +708,9 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING
)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -723,9 +740,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.albert = TFAlbertModel(config, name='albert')
self.predictions = TFAlbertMLMHead(
config, self.albert.embeddings, name='predictions')
self.albert = TFAlbertModel(config, name="albert")
self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
def get_output_embeddings(self):
return self.albert.embeddings
......@@ -734,8 +750,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
outputs = self.albert(inputs, **kwargs)
sequence_output = outputs[0]
prediction_scores = self.predictions(
sequence_output, training=kwargs.get('training', False))
prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))
# Add hidden states and attention if they are here
outputs = (prediction_scores,) + outputs[2:]
......@@ -743,9 +758,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
return outputs # prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -771,22 +789,23 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertModel(config, name='albert')
self.albert = TFAlbertModel(config, name="albert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.albert(inputs, **kwargs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......
......@@ -18,32 +18,77 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
GPT2Config, OpenAIGPTConfig, RobertaConfig,
TransfoXLConfig, XLMConfig, XLNetConfig)
from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \
TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \
TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
from .file_utils import add_start_docstrings
from .configuration_auto import (
BertConfig,
CTRLConfig,
DistilBertConfig,
GPT2Config,
OpenAIGPTConfig,
RobertaConfig,
TransfoXLConfig,
XLMConfig,
XLNetConfig,
)
from .modeling_tf_albert import (
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TFAlbertForMaskedLM,
TFAlbertForSequenceClassification,
TFAlbertModel,
)
from .modeling_tf_bert import (
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TFBertForMaskedLM,
TFBertForQuestionAnswering,
TFBertForSequenceClassification,
TFBertForTokenClassification,
TFBertModel,
)
from .modeling_tf_ctrl import TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, TFCTRLLMHeadModel, TFCTRLModel
from .modeling_tf_distilbert import (
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TFDistilBertForMaskedLM,
TFDistilBertForQuestionAnswering,
TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification,
TFDistilBertModel,
)
from .modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, TFGPT2LMHeadModel, TFGPT2Model
from .modeling_tf_openai import TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel
from .modeling_tf_roberta import (
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
TFRobertaForMaskedLM,
TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
TFRobertaModel,
)
from .modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, TFT5Model, TFT5WithLMHeadModel
from .modeling_tf_transfo_xl import (
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
TFTransfoXLLMHeadModel,
TFTransfoXLModel,
)
from .modeling_tf_xlm import (
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
TFXLMForQuestionAnsweringSimple,
TFXLMForSequenceClassification,
TFXLMModel,
TFXLMWithLMHeadModel,
)
from .modeling_tf_xlnet import (
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
TFXLNetForQuestionAnsweringSimple,
TFXLNetForSequenceClassification,
TFXLNetForTokenClassification,
TFXLNetLMHeadModel,
TFXLNetModel,
)
logger = logging.getLogger(__name__)
TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
(key, value)
for pretrained_map in [
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
......@@ -57,7 +102,8 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
]
for key, value in pretrained_map.items())
for key, value in pretrained_map.items()
)
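# The dict(...) generator above flattens several {shortcut-name: url} maps into
# one lookup table; an equivalent (illustrative) spelling with dict.update,
# where later maps win on duplicate keys:
merged_archive_map = {}
for pretrained_map in [TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP]:
    merged_archive_map.update(pretrained_map)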
class TFAutoModel(object):
......@@ -85,10 +131,13 @@ class TFAutoModel(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModel is designed to be instantiated "
raise EnvironmentError(
"TFAutoModel is designed to be instantiated "
"using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModel.from_config(config)` methods.")
"`TFAutoModel.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -209,32 +258,34 @@ class TFAutoModel(object):
model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)
)
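# Usage sketch for the substring dispatch above. The ordering of the branches
# matters because some identifiers nest: "distilbert" and "roberta" both contain
# "bert", so the more specific checks must run before the plain "bert" branch.
model = TFAutoModel.from_pretrained("bert-base-uncased")  # -> TFBertModel
model = TFAutoModel.from_pretrained("roberta-base")       # -> TFRobertaModel, not TFBertModel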
class TFAutoModelWithLMHead(object):
......@@ -262,10 +313,13 @@ class TFAutoModelWithLMHead(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelWithLMHead is designed to be instantiated "
"using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelWithLMHead.from_config(config)` methods.")
"`TFAutoModelWithLMHead.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -390,32 +444,34 @@ class TFAutoModelWithLMHead(object):
model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)
)
class TFAutoModelForSequenceClassification(object):
......@@ -438,10 +494,13 @@ class TFAutoModelForSequenceClassification(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelForSequenceClassification is designed to be instantiated "
"using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForSequenceClassification.from_config(config)` methods.")
"`TFAutoModelForSequenceClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -552,21 +611,33 @@ class TFAutoModelForSequenceClassification(object):
model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "albert" in pretrained_model_name_or_path:
return TFAlbertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "bert" in pretrained_model_name_or_path:
return TFBertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm" in pretrained_model_name_or_path:
return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)
)
class TFAutoModelForQuestionAnswering(object):
......@@ -588,10 +659,13 @@ class TFAutoModelForQuestionAnswering(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelForQuestionAnswering is designed to be instantiated "
"using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForQuestionAnswering.from_config(config)` methods.")
"`TFAutoModelForQuestionAnswering.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -615,9 +689,9 @@ class TFAutoModelForQuestionAnswering(object):
elif isinstance(config, BertConfig):
return TFBertForQuestionAnswering(config)
elif isinstance(config, XLNetConfig):
return TFXLNetForQuestionAnswering(config)
raise NotImplementedError("TFXLNetForQuestionAnswering isn't implemented")
elif isinstance(config, XLMConfig):
return TFXLMForQuestionAnswering(config)
raise NotImplementedError("TFXLMForQuestionAnswering isn't implemented")
raise ValueError("Unrecognized configuration class {}".format(config))
@classmethod
......@@ -698,24 +772,34 @@ class TFAutoModelForQuestionAnswering(object):
model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForQuestionAnswering.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "bert" in pretrained_model_name_or_path:
return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetForQuestionAnsweringSimple.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm" in pretrained_model_name_or_path:
return TFXLMForQuestionAnsweringSimple.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)
)
class TFAutoModelForTokenClassification:
def __init__(self):
raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelForTokenClassification is designed to be instantiated "
"using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForTokenClassification.from_config(config)` methods.")
"`AutoModelForTokenClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -815,14 +899,20 @@ class TFAutoModelForTokenClassification:
model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'bert' in pretrained_model_name_or_path:
if "bert" in pretrained_model_name_or_path:
return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
elif "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)
)
......@@ -17,43 +17,40 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_bert import BertConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
from .file_utils import add_start_docstrings
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
logger = logging.getLogger(__name__)
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
}
......@@ -67,6 +64,7 @@ def gelu(x):
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
return x * cdf
def gelu_new(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
......@@ -76,41 +74,48 @@ def gelu_new(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
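# For reference, the two variants above compute
#   gelu(x)     = 0.5 * x * (1 + erf(x / sqrt(2)))                           (exact)
#   gelu_new(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))  (tanh approximation)
# gelu_new is the smoother approximation used by the original GPT/BERT codebases.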
def swish(x):
return x * tf.sigmoid(x)
ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
ACT2FN = {
"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish),
"gelu_new": tf.keras.layers.Activation(gelu_new)}
"gelu_new": tf.keras.layers.Activation(gelu_new),
}
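# Usage sketch: config.hidden_act may be either a string key into ACT2FN or a
# callable, which is why the isinstance(..., str) checks appear in the layers
# below. For example (using the module's tf import):
act = ACT2FN["gelu"]
y = act(tf.constant([-1.0, 0.0, 1.0]))  # elementwise GELU over the tensor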
class TFBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config, **kwargs):
super(TFBertEmbeddings, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range),
name='position_embeddings')
self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
name="position_embeddings",
)
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range),
name='token_type_embeddings')
name="token_type_embeddings",
)
# self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
......@@ -121,7 +126,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range))
initializer=get_initializer(self.initializer_range),
)
super(TFBertEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding", training=False):
......@@ -193,7 +199,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
......@@ -201,15 +208,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name='query')
self.key = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name='key')
self.value = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name='value')
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
......@@ -230,7 +237,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(
query_layer, key_layer, transpose_b=True
) # (batch size, num_heads, seq_len_q, seq_len_k)
dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores
attention_scores = attention_scores / tf.math.sqrt(dk)
......@@ -252,8 +261,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
return outputs
......@@ -262,10 +272,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
class TFBertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertSelfOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
......@@ -280,8 +290,8 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
class TFBertAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertAttention, self).__init__(**kwargs)
self.self_attention = TFBertSelfAttention(config, name='self')
self.dense_output = TFBertSelfOutput(config, name='output')
self.self_attention = TFBertSelfAttention(config, name="self")
self.dense_output = TFBertSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
......@@ -298,10 +308,12 @@ class TFBertAttention(tf.keras.layers.Layer):
class TFBertIntermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertIntermediate, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.intermediate_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
......@@ -315,10 +327,10 @@ class TFBertIntermediate(tf.keras.layers.Layer):
class TFBertOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
......@@ -333,9 +345,9 @@ class TFBertOutput(tf.keras.layers.Layer):
class TFBertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertLayer, self).__init__(**kwargs)
self.attention = TFBertAttention(config, name='attention')
self.intermediate = TFBertIntermediate(config, name='intermediate')
self.bert_output = TFBertOutput(config, name='output')
self.attention = TFBertAttention(config, name="attention")
self.intermediate = TFBertIntermediate(config, name="intermediate")
self.bert_output = TFBertOutput(config, name="output")
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -353,7 +365,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
super(TFBertEncoder, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)]
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -385,10 +397,12 @@ class TFBertEncoder(tf.keras.layers.Layer):
class TFBertPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertPooler, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
self.dense = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation='tanh',
name='dense')
activation="tanh",
name="dense",
)
def call(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
......@@ -401,14 +415,16 @@ class TFBertPooler(tf.keras.layers.Layer):
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
......@@ -421,17 +437,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFBertLMPredictionHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.transform = TFBertPredictionHeadTransform(config, name='transform')
self.transform = TFBertPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFBertLMPredictionHead, self).build(input_shape)
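The comment above describes weight tying: the LM head reuses the input embedding matrix as its output projection and only adds the per-token bias built here. A rough sketch of the decode step, assuming a shared embedding matrix `E` of shape `(vocab_size, hidden)`:

import tensorflow as tf

vocab_size, hidden = 100, 16
E = tf.random.normal((vocab_size, hidden))    # shared embedding matrix
bias = tf.zeros((vocab_size,))                # the output-only bias built above
hidden_states = tf.random.normal((2, 8, hidden))

# Project hidden states back onto the vocabulary with the transposed embeddings.
logits = tf.matmul(hidden_states, E, transpose_b=True) + bias  # (2, 8, vocab_size)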
def call(self, hidden_states):
......@@ -444,7 +457,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
class TFBertMLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFBertMLMHead, self).__init__(**kwargs)
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions')
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
......@@ -454,9 +467,9 @@ class TFBertMLMHead(tf.keras.layers.Layer):
class TFBertNSPHead(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertNSPHead, self).__init__(**kwargs)
self.seq_relationship = tf.keras.layers.Dense(2,
kernel_initializer=get_initializer(config.initializer_range),
name='seq_relationship')
self.seq_relationship = tf.keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
)
def call(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
......@@ -468,9 +481,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
super(TFBertMainLayer, self).__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers
self.embeddings = TFBertEmbeddings(config, name='embeddings')
self.encoder = TFBertEncoder(config, name='encoder')
self.pooler = TFBertPooler(config, name='pooler')
self.embeddings = TFBertEmbeddings(config, name="embeddings")
self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler")
def get_input_embeddings(self):
return self.embeddings
......@@ -485,7 +498,16 @@ class TFBertMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -495,12 +517,12 @@ class TFBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
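This branching means the same `call` accepts a bare tensor, a positional list, or a keyword dict. A self-contained toy layer mirroring the convention, with the three equivalent invocations:

import tensorflow as tf

class ToyLayer(tf.keras.layers.Layer):
    # Mirrors the input-parsing convention above in miniature.
    def call(self, inputs, attention_mask=None):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
        elif isinstance(inputs, dict):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
        else:
            input_ids = inputs
        return input_ids if attention_mask is None else input_ids * attention_mask

layer = ToyLayer()
ids = tf.constant([[1, 2, 3]])
mask = tf.constant([[1, 1, 0]])
assert tf.reduce_all(layer(ids) == ids)
assert tf.reduce_all(layer([ids, mask]) == layer({"input_ids": ids, "attention_mask": mask}))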
......@@ -540,7 +562,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
......@@ -552,7 +574,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
......@@ -560,6 +584,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = BertConfig
pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "bert"
......@@ -648,8 +673,12 @@ BERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertModel(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -682,18 +711,22 @@ class TFBertModel(TFBertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertModel, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
return outputs
@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
@add_start_docstrings(
"""Bert Model with two heads on top as done during the pre-training:
a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForPreTraining(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -721,12 +754,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
prediction_scores, seq_relationship_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.nsp = TFBertNSPHead(config, name='nsp___cls')
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
self.bert = TFBertMainLayer(config, name="bert")
self.nsp = TFBertNSPHead(config, name="nsp___cls")
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
def get_output_embeddings(self):
return self.bert.embeddings
......@@ -735,16 +769,19 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
outputs = self.bert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2]
prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
seq_relationship_score = self.nsp(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING
)
class TFBertForMaskedLM(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -770,11 +807,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
self.bert = TFBertMainLayer(config, name="bert")
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
def get_output_embeddings(self):
return self.bert.embeddings
......@@ -783,15 +821,18 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
outputs = self.bert(inputs, **kwargs)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
return outputs # prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -817,11 +858,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
seq_relationship_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.nsp = TFBertNSPHead(config, name='nsp___cls')
self.bert = TFBertMainLayer(config, name="bert")
self.nsp = TFBertNSPHead(config, name="nsp___cls")
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
......@@ -834,9 +876,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
return outputs # seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForSequenceClassification(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -862,22 +907,23 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......@@ -885,9 +931,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForMultipleChoice(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -915,16 +964,26 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
classification_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(1,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -934,12 +993,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
......@@ -956,7 +1015,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
flat_inputs = [
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
outputs = self.bert(flat_inputs, training=training)
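For multiple choice, inputs arrive as `(batch, num_choices, seq_len)` and are flattened to `(batch * num_choices, seq_len)` so the shared encoder sees one row per choice; the per-choice logits are then reshaped back. A shape-only sketch:

import tensorflow as tf

batch, num_choices, seq_len = 2, 4, 16
input_ids = tf.zeros((batch, num_choices, seq_len), dtype=tf.int32)

flat_input_ids = tf.reshape(input_ids, (-1, seq_len))    # (8, 16): one row per choice
logits = tf.zeros((batch * num_choices, 1))              # stand-in for classifier output
reshaped_logits = tf.reshape(logits, (-1, num_choices))  # (2, 4): scores per example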
......@@ -971,9 +1037,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
return outputs # reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForTokenClassification(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -999,22 +1068,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......@@ -1022,9 +1092,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
return outputs # scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForQuestionAnswering(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -1052,14 +1125,15 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, name='bert')
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='qa_outputs')
self.bert = TFBertMainLayer(config, name="bert")
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
......
......@@ -18,29 +18,28 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_ctrl import CTRLConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list, TFSharedEmbeddings
from .file_utils import add_start_docstrings
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
logger = logging.getLogger(__name__)
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-tf_model.h5"}
def angle_defn(pos, i, d_model_size):
angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model_size))
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size))
return pos * angle_rates
def positional_encoding(position, d_model_size):
# create the sinusoidal pattern for the positional encoding
angle_rads = angle_defn(np.arange(position)[:, np.newaxis],
np.arange(d_model_size)[np.newaxis, :],
d_model_size)
angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
sines = np.sin(angle_rads[:, 0::2])
cosines = np.cos(angle_rads[:, 1::2])
......@@ -49,6 +48,7 @@ def positional_encoding(position, d_model_size):
pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32)
return pos_encoding
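A quick sanity check of the encoding above, reusing the `positional_encoding` function just defined: at position 0 every angle is zero, so the sine half of the vector is all zeros and the cosine half all ones.

import numpy as np

d_model_size = 8
pos_encoding = positional_encoding(4, d_model_size)  # function defined above

row0 = pos_encoding.numpy()[0]
assert np.allclose(row0[: d_model_size // 2], 0.0)  # sines of 0
assert np.allclose(row0[d_model_size // 2 :], 1.0)  # cosines of 0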
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
# calculate attention
matmul_qk = tf.matmul(q, k, transpose_b=True)
......@@ -57,7 +57,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
if mask is not None:
scaled_attention_logits += (mask * -1e4)
scaled_attention_logits += mask * -1e4
if attention_mask is not None:
# Apply the attention mask
......@@ -83,11 +83,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self.depth = int(d_model_size / self.num_heads)
self.Wq = tf.keras.layers.Dense(d_model_size, name='Wq')
self.Wk = tf.keras.layers.Dense(d_model_size, name='Wk')
self.Wv = tf.keras.layers.Dense(d_model_size, name='Wv')
self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq")
self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk")
self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv")
self.dense = tf.keras.layers.Dense(d_model_size, name='dense')
self.dense = tf.keras.layers.Dense(d_model_size, name="dense")
def split_into_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
......@@ -122,22 +122,22 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
return outputs
def point_wise_feed_forward_network(d_model_size, dff, name=""):
return tf.keras.Sequential([
tf.keras.layers.Dense(dff, activation='relu', name="0"),
tf.keras.layers.Dense(d_model_size, name="2")
], name="ffn")
return tf.keras.Sequential(
[tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")],
name="ffn",
)
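The helper above is just a two-layer position-wise MLP: the same dense stack is applied to every time step independently, so the sequence shape is preserved. A shape check, reusing the function defined above:

import tensorflow as tf

ffn = point_wise_feed_forward_network(d_model_size=16, dff=64)  # defined above
x = tf.random.normal((2, 5, 16))  # (batch, seq, d_model)
y = ffn(x)
assert y.shape == (2, 5, 16)      # per-position transform preserves shape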
class TFEncoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs):
def __init__(
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
):
super(TFEncoderLayer, self).__init__(**kwargs)
self.multi_head_attention = TFMultiHeadAttention(d_model_size,
num_heads,
output_attentions,
name="multi_head_attention")
self.multi_head_attention = TFMultiHeadAttention(
d_model_size, num_heads, output_attentions, name="multi_head_attention"
)
self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn")
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
......@@ -149,8 +149,9 @@ class TFEncoderLayer(tf.keras.layers.Layer):
def call(self, inputs, training=False):
x, mask, layer_past, attention_mask, head_mask = inputs
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention([normed, normed, normed, mask, layer_past,
attention_mask, head_mask], training=training)
attn_outputs = self.multi_head_attention(
[normed, normed, normed, mask, layer_past, attention_mask, head_mask], training=training
)
attn_output = attn_outputs[0]
attn_output = self.dropout1(attn_output, training=training)
out1 = x + attn_output
......@@ -176,20 +177,23 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
self.w = TFSharedEmbeddings(config.vocab_size,
config.n_embd,
initializer_range=config.initializer_range,
name="w")
self.w = TFSharedEmbeddings(
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w"
)
self.dropout = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFEncoderLayer(config.n_embd,
self.h = [
TFEncoderLayer(
config.n_embd,
config.n_head,
config.dff,
config.resid_pdrop,
config.layer_norm_epsilon,
config.output_attentions,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
name="h_._{}".format(i),
)
for i in range(config.n_layer)
]
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
def get_input_embeddings(self):
......@@ -204,7 +208,17 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
......@@ -215,13 +229,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
past = inputs.get("past", past)
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
......@@ -276,14 +290,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.w(token_type_ids, mode='embedding')
token_type_embeds = self.w(token_type_ids, mode="embedding")
token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))
else:
token_type_embeds = 0
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
if inputs_embeds is None:
inputs_embeds = self.w(input_ids, mode='embedding')
inputs_embeds = self.w(input_ids, mode="embedding")
seq_len = input_shape[-1]
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
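The expression above builds the causal mask: `tf.linalg.band_part(ones, -1, 0)` keeps the lower triangle (including the diagonal), and the `1 -` flips it so that forbidden future positions are marked with 1. For `seq_len = 3`:

import tensorflow as tf

seq_len = 3
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
# mask.numpy() ==
# [[0., 1., 1.],
#  [0., 0., 1.],
#  [0., 0., 0.]]   -> 1 marks a future position to be suppressed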
......@@ -333,6 +347,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = CTRLConfig
pretrained_model_archive_map = TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -392,8 +407,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class TFCTRLModel(TFCTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -423,9 +442,10 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name='transformer')
self.transformer = TFCTRLMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
......@@ -442,10 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFCTRLLMHead, self).build(input_shape)
def call(self, hidden_states):
......@@ -454,8 +471,12 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
return hidden_states
@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -486,9 +507,10 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name='transformer')
self.transformer = TFCTRLMainLayer(config, name="transformer")
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
......
......@@ -16,33 +16,28 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import copy
import sys
from io import open
import itertools
import numpy as np
import tensorflow as tf
from .configuration_distilbert import DistilBertConfig
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list
logger = logging.getLogger(__name__)
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
"distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
"distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
"distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
}
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
def gelu(x):
""" Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
......@@ -53,6 +48,7 @@ def gelu(x):
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
return x * cdf
def gelu_new(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
......@@ -62,24 +58,25 @@ def gelu_new(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
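A quick check of the tanh approximation in `gelu_new` against the exact erf-based `gelu` defined just above: the two formulas agree to a few decimal places for moderate inputs (this sketch recomputes both directly):

import numpy as np
import tensorflow as tf

x = tf.constant([-1.0, 0.0, 1.0])
exact = x * 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
approx = x * 0.5 * (1.0 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
assert np.allclose(exact.numpy(), approx.numpy(), atol=1e-3)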
class TFEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFEmbeddings, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dim = config.dim
self.initializer_range = config.initializer_range
self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
config.dim,
initializer_range=config.initializer_range,
name='word_embeddings') # padding_idx=0)
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.word_embeddings = TFSharedEmbeddings(
config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings"
) # padding_idx=0)
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.dim,
embeddings_initializer=get_initializer(config.initializer_range),
name='position_embeddings')
name="position_embeddings",
)
if config.sinusoidal_pos_embds:
raise NotImplementedError
......@@ -92,9 +89,8 @@ class TFEmbeddings(tf.keras.layers.Layer):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.dim],
initializer=get_initializer(self.initializer_range))
"weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
)
super(TFEmbeddings, self).build(input_shape)
def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
......@@ -181,18 +177,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
assert self.dim % self.n_heads == 0
self.q_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="q_lin")
self.k_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="k_lin")
self.v_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="v_lin")
self.out_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="out_lin")
self.q_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
)
self.k_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
)
self.v_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
)
self.out_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
)
self.pruned_heads = set()
......@@ -259,18 +255,23 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
else:
return (context,)
class TFFFN(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFFFN, self).__init__(**kwargs)
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="lin1")
self.lin2 = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="lin2")
assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu
self.lin1 = tf.keras.layers.Dense(
config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
)
self.lin2 = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
)
assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
config.activation
)
self.activation = (
tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu
)
def call(self, input, training=False):
x = self.lin1(input)
......@@ -341,8 +342,7 @@ class TFTransformer(tf.keras.layers.Layer):
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i))
for i in range(config.n_layers)]
self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
def call(self, inputs, training=False):
"""
......@@ -421,10 +421,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
assert len(inputs) <= 4, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 4, "Too many inputs."
else:
input_ids = inputs
......@@ -458,11 +458,12 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DistilBertConfig
pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "distilbert"
......@@ -534,8 +535,12 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -561,6 +566,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
......@@ -580,10 +586,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFDistilBertLMHead, self).build(input_shape)
def call(self, hidden_states):
......@@ -592,8 +595,11 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
return hidden_states
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -619,6 +625,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.output_attentions = config.output_attentions
......@@ -626,9 +633,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
self.vocab_size = config.vocab_size
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.vocab_transform = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="vocab_transform")
self.vocab_transform = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
)
self.act = tf.keras.layers.Activation(gelu)
self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
......@@ -649,9 +656,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -677,18 +687,21 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.pre_classifier = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation='relu',
name="pre_classifier")
self.classifier = tf.keras.layers.Dense(config.num_labels,
self.pre_classifier = tf.keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier")
activation="relu",
name="pre_classifier",
)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
def call(self, inputs, **kwargs):
......@@ -697,16 +710,19 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim)
logits = self.classifier(pooled_output) # (bs, dim)
outputs = (logits,) + distilbert_output[1:]
return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -728,22 +744,23 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
outputs = model(input_ids)
scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.distilbert(inputs, **kwargs)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......@@ -751,9 +768,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
return outputs # scores, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -781,13 +801,14 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='qa_outputs')
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
assert config.num_labels == 2
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
......@@ -795,7 +816,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
distilbert_output = self.distilbert(inputs, **kwargs)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False)) # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim)
logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
......
......@@ -17,28 +17,31 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
TFSequenceSummary, shape_list, get_initializer)
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
get_initializer,
shape_list,
)
logger = logging.getLogger(__name__)
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",
}
def gelu(x):
......@@ -50,8 +53,7 @@ def gelu(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
......@@ -68,8 +70,8 @@ class TFAttention(tf.keras.layers.Layer):
self.split_size = n_state
self.scale = scale
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
......@@ -82,7 +84,7 @@ class TFAttention(tf.keras.layers.Layer):
"""1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:,None]
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
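A worked case of the mask above for nd=2, ns=3 (two new query positions attending over three total key positions, e.g. one cached key): row i may attend to key j whenever i >= j - ns + nd, which yields 1's in the lower triangle counted from the lower right corner.

import tensorflow as tf

def causal_attention_mask(nd, ns, dtype):
    # Same logic as the staticmethod above, repeated here to be self-contained.
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    return tf.cast(i >= j - ns + nd, dtype)

# Expected output:
# [[1., 1., 0.],
#  [1., 1., 1.]]
print(causal_attention_mask(2, 3, tf.float32).numpy())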
......@@ -158,8 +160,8 @@ class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = gelu
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
......@@ -174,10 +176,10 @@ class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
super(TFBlock, self).__init__(**kwargs)
nx = config.n_embd
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
self.mlp = TFMLP(4 * nx, config, name='mlp')
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.mlp = TFMLP(4 * nx, config, name="mlp")
def call(self, inputs, training=False):
x, layer_past, attention_mask, head_mask = inputs
......@@ -204,20 +206,18 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.wte = TFSharedEmbeddings(config.vocab_size,
config.hidden_size,
initializer_range=config.initializer_range,
name='wte')
self.wpe = tf.keras.layers.Embedding(config.n_positions,
self.wte = TFSharedEmbeddings(
config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
)
self.wpe = tf.keras.layers.Embedding(
config.n_positions,
config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name='wpe')
name="wpe",
)
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx,
config,
scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
def get_input_embeddings(self):
return self.wte
......@@ -231,7 +231,17 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
......@@ -242,13 +252,13 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
past = inputs.get("past", past)
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
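The `past` entry parsed here is what enables incremental decoding: the presents returned by one forward pass are fed back so only the newest token is re-encoded. A hedged sketch of that loop with a tiny randomly initialized model; the `GPT2Config` kwargs are illustrative and may differ across library versions:

import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel

# Tiny, randomly initialized model; config kwargs are assumptions, not canonical.
config = GPT2Config(n_layer=2, n_head=2, n_embd=32, n_positions=16, n_ctx=16)
model = TFGPT2LMHeadModel(config)

input_ids = tf.constant([[1, 2, 3]])
logits, past = model(input_ids)[:2]              # past caches per-layer keys/values

next_token = tf.constant([[4]])
logits, past = model(next_token, past=past)[:2]  # only the new token is encoded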
......@@ -295,7 +305,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
......@@ -304,11 +314,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids, mode='embedding')
inputs_embeds = self.wte(input_ids, mode="embedding")
position_embeds = self.wpe(position_ids)
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.wte(token_type_ids, mode='embedding')
token_type_embeds = self.wte(token_type_ids, mode="embedding")
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
......@@ -353,6 +363,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = GPT2Config
pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -428,8 +439,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -459,17 +474,22 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.transformer = TFGPT2MainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -500,9 +520,10 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.transformer = TFGPT2MainLayer(config, name="transformer")
def get_output_embeddings(self):
return self.transformer.wte
......@@ -518,11 +539,15 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
return outputs # lm_logits, presents, (all hidden_states), (attentions)
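The elided `call` body produces the LM logits by reusing the tied embedding matrix in linear mode, consistent with `get_output_embeddings` above. A rough sketch, not the verbatim diff:

hidden_states = transformer_outputs[0]                           # (batch, seq, n_embd)
lm_logits = self.transformer.wte(hidden_states, mode="linear")   # (batch, seq, vocab_size)
outputs = (lm_logits,) + transformer_outputs[1:]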
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
""",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
......@@ -572,16 +597,30 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
self.transformer = TFGPT2MainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
def get_output_embeddings(self):
return self.transformer.wte
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
......@@ -593,14 +632,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
input_ids = inputs.get("input_ids")
past = inputs.get("past", past)
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
assert len(inputs) <= 8, "Too many inputs."
else:
input_ids = inputs
......@@ -617,7 +656,15 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
flat_inputs = [
flat_input_ids,
past,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0]
......
......@@ -17,25 +17,28 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
TFSequenceSummary, shape_list, get_initializer)
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
get_initializer,
shape_list,
)
logger = logging.getLogger(__name__)
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"
}
def gelu(x):
......@@ -47,8 +50,7 @@ def gelu(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
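This is the tanh approximation to the exact GELU x * Φ(x). A quick numeric sanity check against the erf form, assuming eager TF2 execution:

import numpy as np
import tensorflow as tf

x = tf.constant(np.linspace(-3.0, 3.0, 13), dtype=tf.float32)
exact = x * 0.5 * (1.0 + tf.math.erf(x / np.sqrt(2.0)))
print(np.max(np.abs(gelu(x).numpy() - exact.numpy())))  # small, on the order of 1e-3 or less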
......@@ -56,9 +58,11 @@ def swish(x):
return x * tf.math.sigmoid(x)
ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu),
ACT_FNS = {
"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish)}
"swish": tf.keras.layers.Activation(swish),
}
class TFAttention(tf.keras.layers.Layer):
......@@ -74,8 +78,8 @@ class TFAttention(tf.keras.layers.Layer):
self.split_size = n_state
self.scale = scale
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
......@@ -88,7 +92,7 @@ class TFAttention(tf.keras.layers.Layer):
"""1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:,None]
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
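Concretely, for nd == ns this is a plain lower triangle, and for nd < ns (a short query block attending over cached keys) the band shifts right so each new position still sees every earlier key. Assuming the function is exposed as a static method on TFAttention, as in this file:

import tensorflow as tf

print(TFAttention.causal_attention_mask(3, 3, tf.int32).numpy())
# [[1 0 0]
#  [1 1 0]
#  [1 1 1]]
print(TFAttention.causal_attention_mask(1, 4, tf.int32).numpy())
# [[1 1 1 1]] -- one new query position attends to all four keys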
......@@ -159,8 +163,8 @@ class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = gelu
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
......@@ -175,10 +179,10 @@ class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
super(TFBlock, self).__init__(**kwargs)
nx = config.n_embd
self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
self.mlp = TFMLP(4 * nx, config, name='mlp')
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
def call(self, inputs, training=False):
x, attention_mask, head_mask = inputs
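The rest of the block body is elided by the diff; it follows the original GPT's post-layer-norm residual layout. A hedged sketch of the remainder, consistent with the sublayers built in __init__ above but not the verbatim source:

output_attn = self.attn([x, attention_mask, head_mask], training=training)
a = output_attn[0]
n = self.ln_1(x + a)             # residual + LayerNorm after attention
m = self.mlp(n, training=training)
h = self.ln_2(n + m)             # residual + LayerNorm after the MLP
outputs = [h] + output_attn[1:]  # hidden states plus optional attentions
return outputs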
......@@ -203,19 +207,17 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
config.n_embd,
initializer_range=config.initializer_range,
name='tokens_embed')
self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
self.tokens_embed = TFSharedEmbeddings(
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
)
self.positions_embed = tf.keras.layers.Embedding(
config.n_positions,
config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name='positions_embed')
name="positions_embed",
)
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx,
config,
scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
def get_input_embeddings(self):
return self.tokens_embed
......@@ -229,7 +231,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -239,12 +250,12 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
......@@ -286,7 +297,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
......@@ -295,11 +306,11 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
if inputs_embeds is None:
inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
position_embeds = self.positions_embed(position_ids)
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
......@@ -338,6 +349,7 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = OpenAIGPTConfig
pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -409,8 +421,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -436,17 +452,22 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -472,9 +493,10 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
def get_output_embeddings(self):
return self.transformer.tokens_embed
......@@ -490,11 +512,15 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
return outputs # lm_logits, (all hidden_states), (attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
""",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
......@@ -536,16 +562,29 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
def get_output_embeddings(self):
return self.transformer.tokens_embed
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -556,13 +595,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
......@@ -579,7 +618,14 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
flat_inputs = [
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0]
......