Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
@@ -92,8 +92,8 @@ expected_alpha = {
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class PegasusConfig(BartConfig):
r"""
:class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
`PegasusModel`.
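Example (an illustrative sketch added here, assuming :class:`~transformers.PegasusConfig` is exported at the top level of the package)::
>>> from transformers import PegasusConfig
>>> # Initializing a Pegasus configuration; the configuration fields themselves come from BartConfig
>>> configuration = PegasusConfig()
>>> configuration.model_type
'pegasus'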
"""
model_type = "pegasus"
# The implementation of the config object is in BartConfig
@@ -29,105 +29,105 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ReformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
It is used to instantiate a Reformer model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
attention_head_size (:obj:`int`, optional, defaults to 64):
Dimensionality of the projected key, query and value vectors.
attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
List of attention layer types, in the order in which they appear in the model. Each entry is either
an LSHSelfAttention layer ("lsh") or a LocalSelfAttention layer ("local").
For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
axial_pos_embds (:obj:`bool`, optional, defaults to True):
If `True`, use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
axial_norm_std (:obj:`float`, optional, defaults to 1.0):
The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
The position dims of the axial position encodings.
During training the product of the position dims has to equal the sequence length.
For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
The embedding dims of the axial position encodings.
The sum of the embedding dims has to equal the hidden size.
For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
The chunk size of the final language model feed forward head layer.
A chunk size of 0 means that the feed forward layer is not chunked.
A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
eos_token_id (:obj:`int`, optional, defaults to 2):
The token id for the <EOS> token.
feed_forward_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
hash_seed (:obj:`int`, optional, defaults to `None`):
Seed that can be used to make locality-sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training, `hash_seed` should be left as `None` to ensure fully random rotations in the locality-sensitive hashing scheme.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the output hidden states of the residual attention blocks.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
is_decoder (:obj:`bool`, optional, defaults to False):
If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
local_chunk_length (:obj:`int`, optional, defaults to 64):
Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
local_num_chunks_before (:obj:`int`, optional, defaults to 1):
Number of previous neighbouring chunks to attend to in the LocalSelfAttention layer, in addition to the chunk itself.
local_num_chunks_after (:obj:`int`, optional, defaults to 0):
Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities in LocalSelfAttention.
lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
Number of previous neighbouring chunks to attend to in the LSHSelfAttention layer, in addition to the chunk itself.
lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
Number of following neighbouring chunks to attend to in the LSHSelfAttention layer, in addition to the chunk itself.
lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities in LSHSelfAttention.
max_position_embeddings (:obj:`int`, optional, defaults to 4096):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
Number of buckets that the key/query vectors can be "hashed into" by the locality-sensitive hashing scheme. Each query/key vector is hashed into a hash in `1, ..., num_buckets`.
The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query/key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly (see the illustrative sketch after the example below).
num_hashes (:obj:`int`, optional, defaults to 1):
Number of hashing rounds (e.g. number of random rotations) in the locality-sensitive hashing scheme.
The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory- and time-intensive the hashing becomes.
pad_token_id (:obj:`int`, optional, defaults to 0):
The token id for the <PAD> token.
vocab_size (:obj:`int`, optional, defaults to 320):
Vocabulary size of the Reformer model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.
Example::
>>> from transformers import ReformerModel, ReformerConfig
>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()
>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
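A further illustrative sketch (added here; the values are examples only) of the parameter constraints described above::
>>> # The product of `axial_pos_shape` must equal the training sequence length (64 * 64 = 4096),
>>> # and the entries of `axial_pos_embds_dim` must sum to `hidden_size` (64 + 192 = 256).
>>> custom_configuration = ReformerConfig(
...     axial_pos_shape=[64, 64],
...     axial_pos_embds_dim=[64, 192],
...     hidden_size=256,
...     max_position_embeddings=4096,
...     lsh_attn_chunk_length=64,
... )
>>> # A reasonable bucket count is roughly sequence length / lsh_attn_chunk_length = 4096 / 64
>>> custom_configuration.num_buckets = 64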
"""
model_type = "reformer"
@@ -28,47 +28,47 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RetriBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
It is used to instantiate a RetriBertModel according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
share_encoders (:obj:`bool`, optional, defaults to True):
Whether to use the same Bert-type encoder for the queries and the documents.
projection_dim (:obj:`int`, optional, defaults to 128):
Final dimension of the query and document representations after projection.
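Example (an illustrative sketch added here, following the pattern of the other configuration classes in this commit)::
>>> from transformers import RetriBertConfig, RetriBertModel
>>> # Initializing a RetriBERT configuration
>>> configuration = RetriBertConfig()
>>> # Initializing a model from the configuration
>>> model = RetriBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config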
"""
model_type = "retribert"
@@ -33,34 +33,33 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
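A further illustrative note (added here): RoBERTa uses different special token ids than BERT, as set in the constructor below::
>>> (configuration.pad_token_id, configuration.bos_token_id, configuration.eos_token_id)
(1, 0, 2)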
"""
model_type = "roberta"
def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
"""Constructs RobertaConfig.
"""
"""Constructs RobertaConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -31,33 +31,33 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class T5Config(PretrainedConfig):
r"""
:class:`~transformers.T5Config` is the configuration class to store the configuration of a
`T5Model`.
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
d_model: Size of the encoder layers and the pooler layer. `d_model` can also be accessed via the property `hidden_size`.
num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
d_ff: Size of the intermediate feed forward layer in each `T5Block`.
num_heads: Number of attention heads for each attention layer in
the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
n_positions: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`T5Model`.
initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
layer_norm_eps: The epsilon used by LayerNorm.
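Example (an illustrative sketch added here, following the pattern of the other configuration classes in this commit; with the defaults, `d_kv` equals `d_model // num_heads`, e.g. 512 // 8 = 64)::
>>> from transformers import T5Config, T5Model
>>> # Initializing a T5 configuration
>>> configuration = T5Config()
>>> # Initializing a model from the configuration
>>> model = T5Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config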
"""
model_type = "t5"
@@ -80,7 +80,10 @@ class T5Config(PretrainedConfig):
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
self.vocab_size = vocab_size
self.n_positions = n_positions
@@ -31,84 +31,84 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 267735):
Vocabulary size of the Transformer XL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, optional, defaults to 1024):
Dimensionality of the embeddings
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, optional, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, optional, defaults to 4096):
Inner dimension in FF
div_val (:obj:`int`, optional, defaults to 4):
Divisor value for adaptive input and softmax
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
Apply LayerNorm to the input instead of the output
n_layer (:obj:`int`, optional, defaults to 18):
Number of hidden layers in the Transformer encoder.
tgt_len (:obj:`int`, optional, defaults to 128):
Number of tokens to predict
ext_len (:obj:`int`, optional, defaults to 0):
Length of the extended context
mem_len (:obj:`int`, optional, defaults to 1600):
Length of the retained previous hidden states (the memory)
clamp_len (:obj:`int`, optional, defaults to 1000):
Use the same positional embeddings after clamp_len
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use the same attention length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to share all projection layers except the first one
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
Number of samples in the sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use adaptive softmax
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, proj_init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
Example::
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
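A further illustrative sketch (added here; the values are examples only) using the adaptive softmax and memory parameters described above::
>>> # Adaptive softmax cutoffs, memory length and embedding divisor, as documented above
>>> custom_configuration = TransfoXLConfig(cutoffs=[20000, 40000, 200000], mem_len=1600, div_val=4)
>>> custom_configuration.mem_len
1600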
"""
model_type = "transfo-xl"
@@ -29,116 +29,116 @@ logger = logging.get_logger(__name__)
class PretrainedConfig(object):
r""" Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
configurations.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
initialize a model does **not** load the model weights.
It only affects the model's configuration.
Class attributes (overridden by derived classes)
- **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
recreate the correct object in :class:`~transformers.AutoConfig`.
Args:
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all hidden-states.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should returns all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as decoder or not (in which case it's used as an encoder).
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`)
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
of heads to prune in said layer.
For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
2.
xla_device (:obj:`bool`, `optional`):
A flag to indicate if TPU are available or not.
chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
The chunk size of all feed forward layers in the residual attention blocks.
A chunk size of :obj:`0` means that the feed forward layer is not chunked.
A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
Parameters for sequence generation
- **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
default in the :obj:`generate` method of the model.
- **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
default in the :obj:`generate` method of the model.
- **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
the :obj:`generate` method of the model. Whether or not to use sampling ; use greedy decoding otherwise.
- **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
``num_beams`` sentences are finished per batch or not.
- **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
used by default in the :obj:`generate` method of the model. 1 means no beam search.
- **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to module the next token
probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
positive.
- **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
- **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
:obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
with probabilities that add up to ``top_p`` or higher are kept for generation.
- **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
- **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
will be used by default in the :obj:`generate` method of the model.
- **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
that size can only occur once.
- **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
generated that will be used by default in the :obj:`generate` method of the model. In order to get the
tokens of the words that should not appear in the generated text, use
:obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
returned sequences for each element in the batch that will be used by default in the :obj:`generate`
method of the model.
Parameters for fine-tuning tasks
- **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
model pretrained weights.
- **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
used when converting from an original (TensorFlow or PyTorch) checkpoint.
- **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
index) to label.
- **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
- **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
typically for a classification task.
- **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
the current task.
Parameters linked to the tokenizer
- **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
text before calling the model.
- **bos_token_id** (:obj:`int`, `optional`)) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with
a different token than `bos`, the id of that token.
PyTorch specific parameters
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
used with Torchscript.
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has a output word embedding layer.
TensorFlow specific parameters
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
use BFloat16 scalars (only used by some TensorFlow models).
r"""Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
configurations.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
initialize a model does **not** load the model weights.
It only affects the model's configuration.
Class attributes (overridden by derived classes)
- **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
recreate the correct object in :class:`~transformers.AutoConfig`.
Args:
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all hidden-states.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as a decoder or not (if not, the model is used as an encoder).
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether cross-attention layers should be added to the model. Note that this option is only relevant for models that can be used as decoder models within the :class:`~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
of heads to prune in said layer.
For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
2.
xla_device (:obj:`bool`, `optional`):
A flag to indicate whether a TPU is available or not.
chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
The chunk size of all feed forward layers in the residual attention blocks.
A chunk size of :obj:`0` means that the feed forward layer is not chunked.
A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
Parameters for sequence generation
- **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
default in the :obj:`generate` method of the model.
- **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
default in the :obj:`generate` method of the model.
- **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
the :obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
- **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
``num_beams`` sentences are finished per batch or not.
- **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
used by default in the :obj:`generate` method of the model. 1 means no beam search.
- **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
positive.
- **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
- **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
:obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
with probabilities that add up to ``top_p`` or higher are kept for generation.
- **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
- **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
will be used by default in the :obj:`generate` method of the model.
- **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
that size can only occur once.
- **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
generated that will be used by default in the :obj:`generate` method of the model. In order to get the
tokens of the words that should not appear in the generated text, use
:obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
returned sequences for each element in the batch that will be used by default in the :obj:`generate`
method of the model.
Parameters for fine-tuning tasks
- **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
model pretrained weights.
- **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
used when converting from an original (TensorFlow or PyTorch) checkpoint.
- **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
index) to label.
- **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
- **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
typically for a classification task.
- **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
the current task.
Parameters linked to the tokenizer
- **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
text before calling the model.
- **bos_token_id** (:obj:`int`, `optional`) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`) -- If an encoder-decoder model starts decoding with
a different token than `bos`, the id of that token.
PyTorch specific parameters
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
used with Torchscript.
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
TensorFlow specific parameters
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
use BFloat16 scalars (only used by some TensorFlow models).
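Example (an illustrative sketch added here; the attribute names are the ones documented above, the values are arbitrary)::
>>> from transformers import PretrainedConfig
>>> # Defaults used by the :obj:`generate` method can be stored directly on the configuration
>>> configuration = PretrainedConfig(max_length=50, num_beams=4, early_stopping=True)
>>> configuration.num_beams
4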
"""
model_type: str = ""
@@ -36,120 +36,120 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use "gelu" as the non-linear activation function in the
encoder and pooler instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model-agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
Example::
>>> from transformers import XLMConfig, XLMModel
>>> # Initializing an XLM configuration
>>> configuration = XLMConfig()
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism.
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use a GELU non-linearity instead of ReLU in the
encoder and pooler.
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to attend only to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Whether to add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output; any other value => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation.
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model-agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
Example::
>>> from transformers import XLMConfig, XLMModel
>>> # Initializing an XLM configuration
>>> configuration = XLMConfig()
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlm"
......@@ -191,8 +191,7 @@ class XLMConfig(PretrainedConfig):
bos_token_id=0,
**kwargs
):
"""Constructs XLMConfig.
"""
"""Constructs XLMConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.vocab_size = vocab_size
self.emb_dim = emb_dim
......
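As a quick, hedged illustration of the XLMConfig arguments documented above (an editorial sketch, not part of this commit; the chosen values are arbitrary):

>>> from transformers import XLMConfig, XLMModel
>>> # Hypothetical: a causal, bilingual XLM that keeps language embeddings enabled
>>> configuration = XLMConfig(causal=True, n_langs=2, use_lang_emb=True)
>>> model = XLMModel(configuration)
>>> model.config.n_langs
2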
......@@ -31,104 +31,104 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLNetConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 32000):
Vocabulary size of the XLNet model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to untie the relative position biases.
attn_type (:obj:`string`, optional, defaults to "bi"):
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use a bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Whether to add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output; any other value => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation.
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last pre-computed hidden states.
.. note::
This flag behaves differently from other models: it only controls the inference behavior; during
training the model always uses ``use_cache=True``.
Example::
>>> from transformers import XLNetConfig, XLNetModel
>>> # Initializing an XLNet configuration
>>> configuration = XLNetConfig()
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 32000):
Vocabulary size of the XLNet model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to untie the relative position biases.
attn_type (:obj:`string`, optional, defaults to "bi"):
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use a bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Whether to add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output; any other value => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation.
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last pre-computed hidden states.
.. note::
This flag behaves differently from other models: it only controls the inference behavior; during
training the model always uses ``use_cache=True``.
Example::
>>> from transformers import XLNetConfig, XLNetModel
>>> # Initializing an XLNet configuration
>>> configuration = XLNetConfig()
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlnet"
......@@ -162,8 +162,7 @@ class XLNetConfig(PretrainedConfig):
eos_token_id=2,
**kwargs
):
"""Constructs XLNetConfig.
"""
"""Constructs XLNetConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.d_model = d_model
......
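XLNetConfig follows the same pattern; as a hedged sketch (not part of this commit), the caching arguments described above can be overridden at construction time:

>>> from transformers import XLNetConfig, XLNetModel
>>> # Hypothetical: keep a 1024-token memory and the default bidirectional attention
>>> configuration = XLNetConfig(mem_len=1024, attn_type="bi", use_cache=True)
>>> model = XLNetModel(configuration)
>>> model.config.mem_len
1024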
......@@ -27,5 +27,6 @@ if __name__ == "__main__":
checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
convert_dialogpt_checkpoint(
checkpoint_path, pytorch_dump_folder_path,
checkpoint_path,
pytorch_dump_folder_path,
)
......@@ -38,24 +38,39 @@ class OnnxConverterArgumentParser(ArgumentParser):
super().__init__("ONNX Converter")
self.add_argument(
"--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
"--pipeline",
type=str,
choices=SUPPORTED_PIPELINES,
default="feature-extraction",
)
self.add_argument(
"--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
"--model",
type=str,
required=True,
help="Model's id or path (ex: bert-base-cased)",
)
self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
self.add_argument(
"--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
"--framework",
type=str,
choices=["pt", "tf"],
help="Framework for loading the model",
)
self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
self.add_argument(
"--check-loading", action="store_true", help="Check ONNX is able to load the model",
"--check-loading",
action="store_true",
help="Check ONNX is able to load the model",
)
self.add_argument(
"--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb",
"--use-external-format",
action="store_true",
help="Allow exporting model >= than 2Gb",
)
self.add_argument(
"--quantize", action="store_true", help="Quantize the neural network to be run with int8",
"--quantize",
action="store_true",
help="Quantize the neural network to be run with int8",
)
self.add_argument("output")
......@@ -376,7 +391,10 @@ def quantize(onnx_model_path: Path) -> Path:
)
quantized_model = quantize(
model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
model=onnx_model,
quantization_mode=QuantizationMode.IntegerOps,
force_fusions=True,
symmetric_weight=True,
)
# Append "-quantized" at the end of the model's name
......
......@@ -255,7 +255,11 @@ license: apache-2.0
def write_model_card(
hf_model_name: str, repo_root="OPUS-MT-train", save_dir=Path("marian_converted"), dry_run=False, extra_metadata={},
hf_model_name: str,
repo_root="OPUS-MT-train",
save_dir=Path("marian_converted"),
dry_run=False,
extra_metadata={},
) -> str:
"""Copy the most recent model's readme section from opus, and add metadata.
upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
......@@ -604,7 +608,9 @@ class OpusState:
assert "hidden_size" not in cfg.to_dict()
load_layers_(
model.model.encoder.layers, state_dict, BART_CONVERTER,
model.model.encoder.layers,
state_dict,
BART_CONVERTER,
)
load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)
......
......@@ -108,7 +108,12 @@ if is_torch_available():
logging.set_verbosity_info()
MODEL_CLASSES = {
"bert": (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"bert": (
BertConfig,
TFBertForPreTraining,
BertForPreTraining,
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"bert-large-uncased-whole-word-masking-finetuned-squad": (
BertConfig,
TFBertForQuestionAnswering,
......@@ -127,9 +132,24 @@ MODEL_CLASSES = {
BertForSequenceClassification,
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"gpt2": (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"xlnet": (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"xlm": (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"gpt2": (
GPT2Config,
TFGPT2LMHeadModel,
GPT2LMHeadModel,
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlnet": (
XLNetConfig,
TFXLNetLMHeadModel,
XLNetLMHeadModel,
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlm": (
XLMConfig,
TFXLMWithLMHeadModel,
XLMWithLMHeadModel,
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlm-roberta": (
XLMRobertaConfig,
TFXLMRobertaForMaskedLM,
......@@ -148,7 +168,12 @@ MODEL_CLASSES = {
OpenAIGPTLMHeadModel,
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"roberta": (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"roberta": (
RobertaConfig,
TFRobertaForMaskedLM,
RobertaForMaskedLM,
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"roberta-large-mnli": (
RobertaConfig,
TFRobertaForSequenceClassification,
......@@ -179,10 +204,30 @@ MODEL_CLASSES = {
DistilBertForQuestionAnswering,
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"ctrl": (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"albert": (AlbertConfig, TFAlbertForPreTraining, AlbertForPreTraining, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"t5": (T5Config, TFT5ForConditionalGeneration, T5ForConditionalGeneration, T5_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"electra": (ElectraConfig, TFElectraForPreTraining, ElectraForPreTraining, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"ctrl": (
CTRLConfig,
TFCTRLLMHeadModel,
CTRLLMHeadModel,
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"albert": (
AlbertConfig,
TFAlbertForPreTraining,
AlbertForPreTraining,
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"t5": (
T5Config,
TFT5ForConditionalGeneration,
T5ForConditionalGeneration,
T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"electra": (
ElectraConfig,
TFElectraForPreTraining,
ElectraForPreTraining,
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
}
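Every entry in MODEL_CLASSES keeps the same four-element shape after the reformatting (config class, TF model class, PyTorch model class, pretrained config archive map); a hedged sketch of how such an entry might be unpacked (illustrative only, not code from this commit):

>>> config_class, tf_model_class, pt_model_class, archive_map = MODEL_CLASSES["bert"]
>>> config_class.__name__
'BertConfig'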
......
......@@ -49,10 +49,12 @@ def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.value,
torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
torch_layer.output.dense,
torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
)
......@@ -64,16 +66,20 @@ def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
np_dense = np.asarray(weights[3])
set_param(
torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.query,
torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.key,
torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.value,
torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
torch_layer.output.dense,
torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
)
......@@ -83,7 +89,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
layer_norm_1_weight = np.asarray(layer_norm_1[0])
layer_norm_1_bias = np.asarray(layer_norm_1[1])
set_param(
torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias),
torch_block.attention.layer_norm,
torch.tensor(layer_norm_1_weight),
torch.tensor(layer_norm_1_bias),
)
# lsh weights + output
......@@ -104,7 +112,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
layer_norm_2_weight = np.asarray(intermediate_weights[0][0])
layer_norm_2_bias = np.asarray(intermediate_weights[0][1])
set_param(
torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias),
torch_block.feed_forward.layer_norm,
torch.tensor(layer_norm_2_weight),
torch.tensor(layer_norm_2_bias),
)
# intermediate dense
......@@ -133,7 +143,8 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
# word embeds
word_embeddings = np.asarray(weights[1])
set_param(
torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
torch_model_reformer.embeddings.word_embeddings,
torch.tensor(word_embeddings),
)
if isinstance(weights[3], tuple):
......
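The repeated set_param calls above all follow the same shape: a reshaped weight tensor, and sometimes a bias, is copied into a torch layer. A minimal sketch of such a helper, assuming it only validates shapes and wraps the tensors as parameters (an illustration with a hypothetical name, not the helper defined in this file):

import torch

def set_param_sketch(torch_layer, weight, bias=None):
    # Assumed behavior: check that shapes match, then assign the converted tensors as nn.Parameters
    assert torch_layer.weight.shape == weight.shape, "weight shape mismatch"
    torch_layer.weight = torch.nn.Parameter(weight)
    if bias is not None:
        assert torch_layer.bias.shape == bias.shape, "bias shape mismatch"
        torch_layer.bias = torch.nn.Parameter(bias)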
......@@ -86,7 +86,10 @@ class GlueDataset(Dataset):
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
mode.value,
tokenizer.__class__.__name__,
str(args.max_seq_length),
args.task_name,
),
)
label_list = self.processor.get_labels()
......
......@@ -21,7 +21,11 @@ class TextDataset(Dataset):
"""
def __init__(
self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
self,
tokenizer: PreTrainedTokenizer,
file_path: str,
block_size: int,
overwrite_cache=False,
):
assert os.path.isfile(file_path), f"Input file path {file_path} not found"
......@@ -29,7 +33,12 @@ class TextDataset(Dataset):
directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(
directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
directory,
"cached_lm_{}_{}_{}".format(
tokenizer.__class__.__name__,
str(block_size),
filename,
),
)
# Make sure only the first process in distributed training processes the dataset,
......
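The reformatted format() call above is what names the on-disk cache file; as a hedged illustration (the tokenizer class, block size, and file name below are made-up example values):

>>> "cached_lm_{}_{}_{}".format("BertTokenizer", str(512), "train.txt")
'cached_lm_BertTokenizer_512_train.txt'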
......@@ -119,7 +119,10 @@ class SquadDataset(Dataset):
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), version_tag,
mode.value,
tokenizer.__class__.__name__,
str(args.max_seq_length),
version_tag,
),
)
......
......@@ -589,10 +589,10 @@ def compute_predictions_log_probs(
tokenizer,
verbose_logging,
):
""" XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
"""XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
Requires utils_squad_evaluate.py
Requires utils_squad_evaluate.py
"""
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
......
......@@ -69,7 +69,10 @@ def glue_convert_examples_to_features(
if is_tf_available():
def _tf_glue_convert_examples_to_features(
examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None,
examples: tf.data.Dataset,
tokenizer: PreTrainedTokenizer,
task=str,
max_length: Optional[int] = None,
) -> tf.data.Dataset:
"""
Returns:
......
......@@ -269,7 +269,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
logger.info("Tokenizing example %d", ex_index)
input_ids = tokenizer.encode(
example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len),
example.text_a,
add_special_tokens=True,
max_length=min(max_length, tokenizer.max_len),
)
all_input_ids.append(input_ids)
......
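The reformatted encode call above caps each example at the tokenizer's own limit; a hedged usage sketch (the sentence and the 128-token budget are assumptions, and `tokenizer` is any PreTrainedTokenizer instance):

>>> input_ids = tokenizer.encode(
...     "This is example text_a.",
...     add_special_tokens=True,
...     max_length=min(128, tokenizer.max_len),
... )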