Black 20 release

a75c64d8 · Lysandre · e78c1103 · a75c64d8 · a75c64d8 · a75c64d8
Commit a75c64d8 authored Aug 26, 2020 by Lysandre
20 changed files
--- a/src/transformers/configuration_pegasus.py
+++ b/src/transformers/configuration_pegasus.py
@@ -92,8 +92,8 @@ expected_alpha = {
 @add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
 class PegasusConfig(BartConfig):
    r"""
-        :class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
-        `PegasusModel`.
+    :class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
+    `PegasusModel`.
    """
    model_type = "pegasus"
    # The implementation of the config object is in BartConfig
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -29,105 +29,105 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class ReformerConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
-        It is used to instantiate an Reformer model according to the specified arguments, defining the model
-        architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
+    It is used to instantiate a Reformer model according to the specified arguments, defining the model
+    architecture.

-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.

-        Args:
-            attention_head_size (:obj:`int`, optional, defaults to 64):
-                Dimensionality of the projected key, query and value vectors
-            attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
-                List of attention layer types in ascending order. It can be chosen between a
-                LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
-                For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
-                For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
-            axial_pos_embds (:obj:`bool`, optional, defaults to True):
-                If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
-            axial_norm_std (:obj:`float`, optional, defaluts to 1.0):
-                The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
-            axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
-                The position dims of the axial position encodings.
-                During training the product of the position dims has to equal the sequence length.
-                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
-            axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
-                The embedding dims of the axial position encodings.
-                The sum of the embedding dims has to equal the hidden size.
-                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
-            chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
-                The chunk size of the final language model feed forward head layer.
-                A chunk size of 0 means that the feed forward layer is not chunked.
-                A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
-                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
-            eos_token_id (:obj:`int`, optional, defaults to 2):
-                The token id for the <EOS> token.
-            feed_forward_size (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
-            hash_seed (:obj:`int`, optional, defaults to `None`):
-                Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposed. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
-                The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
-                If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            hidden_size (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the output hidden states of the residual attention blocks.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            is_decoder (:obj:`bool`, optional, defaults to False):
-                If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
-                When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            local_chunk_length (:obj:`int`, optional, defaults to 64):
-                Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
-            local_num_chunks_before (:obj:`int`, optional, defaults to 1):
-                Number of previous neighbouring chunks to attend to in LocalSelfAttention layer to itself.
-            local_num_chunks_after (:obj:`int`, optional, defaults to 0):
-                Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
-            local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities in LocalSelfAttention.
-            lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
-                Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
-            lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
-                Number of previous neighbouring chunks to attend to in LSHSelfAttention layer to itself.
-            lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
-                Number of following neighbouring chunks to attend to in LSHSelfAttention layer to itself.
-            lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities in LSHSelfAttention.
-            max_position_embeddings (:obj:`int`, optional, defaults to 4096):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
-                Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
-                The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
-                The number of buckets (or the product the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
-            num_hashes (:obj:`int`, optional, defaults to 1):
-                Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme.
-                The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
-            pad_token_id (:obj:`int`, optional, defaults to 0):
-                The token id for the <PAD> token.
-            vocab_size (:obj:`int`, optional, defaults to 320):
-                Vocabulary size of the Reformer model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.
+    Args:
+        attention_head_size (:obj:`int`, optional, defaults to 64):
+            Dimensionality of the projected key, query and value vectors
+        attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
+            List of attention layer types in ascending order. It can be chosen between a
+            LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
+            For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
+            For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
+        axial_pos_embds (:obj:`bool`, optional, defaults to True):
+            If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
+        axial_norm_std (:obj:`float`, optional, defaults to 1.0):
+            The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
+        axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
+            The position dims of the axial position encodings.
+            During training the product of the position dims has to equal the sequence length.
+            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
+        axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
+            The embedding dims of the axial position encodings.
+            The sum of the embedding dims has to equal the hidden size.
+            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
+        chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
+            The chunk size of the final language model feed forward head layer.
+            A chunk size of 0 means that the feed forward layer is not chunked.
+            A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
+            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
+        eos_token_id (:obj:`int`, optional, defaults to 2):
+            The token id for the <EOS> token.
+        feed_forward_size (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
+        hash_seed (:obj:`int`, optional, defaults to `None`):
+            Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
+            The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
+            If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        hidden_size (:obj:`int`, optional, defaults to 256):
+            Dimensionality of the output hidden states of the residual attention blocks.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        is_decoder (:obj:`bool`, optional, defaults to False):
+            If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
+            When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        local_chunk_length (:obj:`int`, optional, defaults to 64):
+            Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
+        local_num_chunks_before (:obj:`int`, optional, defaults to 1):
+            Number of previous neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
+        local_num_chunks_after (:obj:`int`, optional, defaults to 0):
+            Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
+        local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities in LocalSelfAttention.
+        lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
+            Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
+        lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
+            Number of previous neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
+        lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
+            Number of following neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
+        lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities in LSHSelfAttention.
+        max_position_embeddings (:obj:`int`, optional, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
+            Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
+            The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
+            The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
+        num_hashes (:obj:`int`, optional, defaults to 1):
+            Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme.
+            The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
+        pad_token_id (:obj:`int`, optional, defaults to 0):
+            The token id for the <PAD> token.
+        vocab_size (:obj:`int`, optional, defaults to 320):
+            Vocabulary size of the Reformer model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.

-        Example::
+    Example::

-            >>> from transformers import ReformerModel, ReformerConfig
+        >>> from transformers import ReformerModel, ReformerConfig

-            >>> # Initializing a Reformer configuration
-            >>> configuration = ReformerConfig()
+        >>> # Initializing a Reformer configuration
+        >>> configuration = ReformerConfig()

-            >>> # Initializing a Reformer model
-            >>> model = ReformerModel(configuration)
+        >>> # Initializing a Reformer model
+        >>> model = ReformerModel(configuration)

-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "reformer"


--- a/src/transformers/configuration_retribert.py
+++ b/src/transformers/configuration_retribert.py
@@ -28,47 +28,47 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class RetriBertConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
-        It is used to instantiate a RetriBertModel model according to the specified arguments, defining the model
-        architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
+    It is used to instantiate a RetriBertModel model according to the specified arguments, defining the model
+    architecture.

-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.


-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the BERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 3072):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            share_encoders (:obj:`bool`, optional, defaults to True):
-                Whether to use the same Bert-type encoder for the queries and document
-            projection_dim (:obj:`int`, optional, defaults to 128):
-                Final dimension of the query and document representation after projection
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        share_encoders (:obj:`bool`, optional, defaults to True):
+            Whether to use the same Bert-type encoder for the queries and document
+        projection_dim (:obj:`int`, optional, defaults to 128):
+            Final dimension of the query and document representation after projection

    """
    model_type = "retribert"

--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -33,34 +33,33 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class RobertaConfig(BertConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
-        It is used to instantiate an RoBERTa model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+    This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
+    It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.

-        The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
-        It reuses the same defaults. Please check the parent class for more information.
+    The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
+    It reuses the same defaults. Please check the parent class for more information.

-        Example::
+    Example::

-            >>> from transformers import RobertaConfig, RobertaModel
+        >>> from transformers import RobertaConfig, RobertaModel

-            >>> # Initializing a RoBERTa configuration
-            >>> configuration = RobertaConfig()
+        >>> # Initializing a RoBERTa configuration
+        >>> configuration = RobertaConfig()

-            >>> # Initializing a model from the configuration
-            >>> model = RobertaModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = RobertaModel(configuration)

-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "roberta"

    def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
-        """Constructs RobertaConfig.
-        """
+        """Constructs RobertaConfig."""
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
--- a/src/transformers/configuration_t5.py
+++ b/src/transformers/configuration_t5.py
@@ -31,33 +31,33 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class T5Config(PretrainedConfig):
    r"""
-        :class:`~transformers.T5Config` is the configuration class to store the configuration of a
-        `T5Model`.
+    :class:`~transformers.T5Config` is the configuration class to store the configuration of a
+    `T5Model`.


-        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
-            d_model: Size of the encoder layers and the pooler layer. `d_model` can also accesed via the property `hidden_size`.
-            num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
-            d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
-            d_ff: Size of the intermediate feed forward layer in each `T5Block`.
-            num_heads: Number of attention heads for each attention layer in
-                the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            n_positions: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `T5Model`.
-            initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
-            layer_norm_eps: The epsilon used by LayerNorm.
+    Arguments:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
+        d_model: Size of the encoder layers and the pooler layer. `d_model` can also be accessed via the property `hidden_size`.
+        num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
+        d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
+        d_ff: Size of the intermediate feed forward layer in each `T5Block`.
+        num_heads: Number of attention heads for each attention layer in
+            the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
+        intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        hidden_act: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob: The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob: The dropout ratio for the attention
+            probabilities.
+        n_positions: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
+        type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+            `T5Model`.
+        initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
+        layer_norm_eps: The epsilon used by LayerNorm.
    """
    model_type = "t5"

@@ -80,7 +80,10 @@ class T5Config(PretrainedConfig):
        **kwargs
    ):
        super().__init__(
-            pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
        )
        self.vocab_size = vocab_size
        self.n_positions = n_positions

--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -31,84 +31,84 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class TransfoXLConfig(PretrainedConfig):
    """
-        This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
-        It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 267735):
-                Vocabulary size of the Transformer XL model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
-            cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
-                Cutoffs for the adaptive softmax
-            d_model (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the model's hidden states.
-            d_embed (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the embeddings
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            d_head (:obj:`int`, optional, defaults to 64):
-                Dimensionality of the model's heads.
-            d_inner (:obj:`int`, optional, defaults to 4096):
-                Inner dimension in FF
-            div_val (:obj:`int`, optional, defaults to 4):
-                Divident value for adapative input and softmax
-            pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Apply LayerNorm to the input instead of the output
-            n_layer (:obj:`int`, optional, defaults to 18):
-                Number of hidden layers in the Transformer encoder.
-            tgt_len (:obj:`int`, optional, defaults to 128):
-                Number of tokens to predict
-            ext_len (:obj:`int`, optional, defaults to 0):
-                Length of the extended context
-            mem_len (:obj:`int`, optional, defaults to 1600):
-                Length of the retained previous heads
-            clamp_len (:obj:`int`, optional, defaults to 1000):
-                use the same pos embeddings after clamp_len
-            same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Use the same attn length for all tokens
-            proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
-                True to share all but first projs, False not to share.
-            attn_type (:obj:`int`, optional, defaults to 0):
-                Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-            sample_softmax (:obj:`int`, optional, defaults to -1):
-                number of samples in sampled softmax
-            adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
-                use adaptive softmax
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            dropatt (:obj:`float`, optional, defaults to 0):
-                The dropout ratio for the attention probabilities.
-            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Untie relative position biases
-            init (:obj:`string`, optional, defaults to `normal`):
-                Parameter initializer to use
-            init_range (:obj:`float`, optional, defaults to 0.01):
-                Parameters initialized by U(-init_range, init_range).
-            proj_init_std (:obj:`float`, optional, defaults to 0.01):
-                Parameters initialized by N(0, init_std)
-            init_std (:obj:`float`, optional, defaults to 0.02):
-                Parameters initialized by N(0, init_std)
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-
-        Example::
-
-            >>> from transformers import TransfoXLConfig, TransfoXLModel
-
-            >>> # Initializing a Transformer XL configuration
-            >>> configuration = TransfoXLConfig()
-
-            >>> # Initializing a model from the configuration
-            >>> model = TransfoXLModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
+    It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 267735):
+            Vocabulary size of the Transformer XL model. Defines the different tokens that
+            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
+        cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
+            Cutoffs for the adaptive softmax
+        d_model (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the model's hidden states.
+        d_embed (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the embeddings
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        d_head (:obj:`int`, optional, defaults to 64):
+            Dimensionality of the model's heads.
+        d_inner (:obj:`int`, optional, defaults to 4096):
+            Inner dimension in FF
+        div_val (:obj:`int`, optional, defaults to 4):
+            Divisor value for adaptive input and softmax
+        pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Apply LayerNorm to the input instead of the output
+        n_layer (:obj:`int`, optional, defaults to 18):
+            Number of hidden layers in the Transformer encoder.
+        tgt_len (:obj:`int`, optional, defaults to 128):
+            Number of tokens to predict
+        ext_len (:obj:`int`, optional, defaults to 0):
+            Length of the extended context
+        mem_len (:obj:`int`, optional, defaults to 1600):
+            Length of the retained previous heads
+        clamp_len (:obj:`int`, optional, defaults to 1000):
+            use the same pos embeddings after clamp_len
+        same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Use the same attn length for all tokens
+        proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
+            True to share all but first projs, False not to share.
+        attn_type (:obj:`int`, optional, defaults to 0):
+            Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+        sample_softmax (:obj:`int`, optional, defaults to -1):
+            number of samples in sampled softmax
+        adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
+            use adaptive softmax
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        dropatt (:obj:`float`, optional, defaults to 0):
+            The dropout ratio for the attention probabilities.
+        untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Untie relative position biases
+        init (:obj:`string`, optional, defaults to `normal`):
+            Parameter initializer to use
+        init_range (:obj:`float`, optional, defaults to 0.01):
+            Parameters initialized by U(-init_range, init_range).
+        proj_init_std (:obj:`float`, optional, defaults to 0.01):
+            Parameters initialized by N(0, init_std)
+        init_std (:obj:`float`, optional, defaults to 0.02):
+            Parameters initialized by N(0, init_std)
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers
+
+    Example::
+
+        >>> from transformers import TransfoXLConfig, TransfoXLModel
+
+        >>> # Initializing a Transformer XL configuration
+        >>> configuration = TransfoXLConfig()
+
+        >>> # Initializing a model from the configuration
+        >>> model = TransfoXLModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """

    model_type = "transfo-xl"

--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
--- a/src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
@@ -27,5 +27,6 @@ if __name__ == "__main__":
        checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
        pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
        convert_dialogpt_checkpoint(
-            checkpoint_path, pytorch_dump_folder_path,
+            checkpoint_path,
+            pytorch_dump_folder_path,
        )
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -38,24 +38,39 @@ class OnnxConverterArgumentParser(ArgumentParser):
        super().__init__("ONNX Converter")

        self.add_argument(
-            "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
+            "--pipeline",
+            type=str,
+            choices=SUPPORTED_PIPELINES,
+            default="feature-extraction",
        )
        self.add_argument(
-            "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
+            "--model",
+            type=str,
+            required=True,
+            help="Model's id or path (ex: bert-base-cased)",
        )
        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
        self.add_argument(
-            "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
+            "--framework",
+            type=str,
+            choices=["pt", "tf"],
+            help="Framework for loading the model",
        )
        self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
        self.add_argument(
-            "--check-loading", action="store_true", help="Check ONNX is able to load the model",
+            "--check-loading",
+            action="store_true",
+            help="Check ONNX is able to load the model",
        )
        self.add_argument(
-            "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb",
+            "--use-external-format",
+            action="store_true",
+            help="Allow exporting model >= than 2Gb",
        )
        self.add_argument(
-            "--quantize", action="store_true", help="Quantize the neural network to be run with int8",
+            "--quantize",
+            action="store_true",
+            help="Quantize the neural network to be run with int8",
        )
        self.add_argument("output")

@@ -376,7 +391,10 @@ def quantize(onnx_model_path: Path) -> Path:
    )

    quantized_model = quantize(
-        model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
+        model=onnx_model,
+        quantization_mode=QuantizationMode.IntegerOps,
+        force_fusions=True,
+        symmetric_weight=True,
    )

    # Append "-quantized" at the end of the model's name

--- a/src/transformers/convert_marian_to_pytorch.py
+++ b/src/transformers/convert_marian_to_pytorch.py
@@ -255,7 +255,11 @@ license: apache-2.0


 def write_model_card(
-    hf_model_name: str, repo_root="OPUS-MT-train", save_dir=Path("marian_converted"), dry_run=False, extra_metadata={},
+    hf_model_name: str,
+    repo_root="OPUS-MT-train",
+    save_dir=Path("marian_converted"),
+    dry_run=False,
+    extra_metadata={},
 ) -> str:
    """Copy the most recent model's readme section from opus, and add metadata.
    upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
@@ -604,7 +608,9 @@ class OpusState:

        assert "hidden_size" not in cfg.to_dict()
        load_layers_(
-            model.model.encoder.layers, state_dict, BART_CONVERTER,
+            model.model.encoder.layers,
+            state_dict,
+            BART_CONVERTER,
        )
        load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)


--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
--- a/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
@@ -49,10 +49,12 @@ def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
        torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size),
    )
    set_param(
-        torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.value,
+        torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
    )
    set_param(
-        torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
+        torch_layer.output.dense,
+        torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
    )


@@ -64,16 +66,20 @@ def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
    np_dense = np.asarray(weights[3])

    set_param(
-        torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.query,
+        torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
    )
    set_param(
-        torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.key,
+        torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
    )
    set_param(
-        torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
+        torch_layer.self_attention.value,
+        torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
    )
    set_param(
-        torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
+        torch_layer.output.dense,
+        torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
    )


@@ -83,7 +89,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
    layer_norm_1_weight = np.asarray(layer_norm_1[0])
    layer_norm_1_bias = np.asarray(layer_norm_1[1])
    set_param(
-        torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias),
+        torch_block.attention.layer_norm,
+        torch.tensor(layer_norm_1_weight),
+        torch.tensor(layer_norm_1_bias),
    )

    # lsh weights + output
@@ -104,7 +112,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
    layer_norm_2_weight = np.asarray(intermediate_weights[0][0])
    layer_norm_2_bias = np.asarray(intermediate_weights[0][1])
    set_param(
-        torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias),
+        torch_block.feed_forward.layer_norm,
+        torch.tensor(layer_norm_2_weight),
+        torch.tensor(layer_norm_2_bias),
    )

    # intermediate dense
@@ -133,7 +143,8 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
    # word embeds
    word_embeddings = np.asarray(weights[1])
    set_param(
-        torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
+        torch_model_reformer.embeddings.word_embeddings,
+        torch.tensor(word_embeddings),
    )

    if isinstance(weights[3], tuple):

--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
--- a/src/transformers/data/datasets/squad.py
+++ b/src/transformers/data/datasets/squad.py
--- a/src/transformers/data/metrics/squad_metrics.py
+++ b/src/transformers/data/metrics/squad_metrics.py
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py