Unverified commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Syling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -68,12 +68,11 @@ task_specific_params = {
class PegasusConfig(BartConfig):
    """
    This is the configuration class to store the configuration of a
    :class:`~transformers.PegasusForConditionalGeneration`. It is used to instantiate a Pegasus model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 96103):
@@ -94,8 +93,8 @@ class PegasusConfig(BartConfig):
        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
@@ -105,8 +104,8 @@ class PegasusConfig(BartConfig):
        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -128,11 +127,11 @@ class PegasusConfig(BartConfig):
        bos_token_id (:obj:`int`, `optional`, defaults to 0)
            Beginning of stream token id.
        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper
            <https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper
            <https://arxiv.org/abs/1909.11556>`__ for more details.
        extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
            How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart.
        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
...
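For context on the class whose docstring is restyled above, a minimal usage sketch (not part of the diff; the quoted defaults come from the docstring itself)::

    from transformers import PegasusConfig

    # Build a configuration from the documented defaults, overriding one value.
    config = PegasusConfig(activation_function="gelu")
    print(config.vocab_size)               # 96103, per the docstring above
    print(config.max_position_embeddings)  # 1024, per the docstring above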
@@ -28,22 +28,21 @@ PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ProphetNetConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.ProphetNetModel`. It is used
    to instantiate a ProphetNet model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        activation_dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for activations inside the fully connected layer.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the ProphetNet model. Defines the number of different tokens that can be represented by
            the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
@@ -63,8 +62,8 @@ class ProphetNetConfig(PretrainedConfig):
        dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
@@ -78,21 +77,19 @@ class ProphetNetConfig(PretrainedConfig):
        eos_token_id (:obj:`int`, `optional`, defaults to 2)
            End of stream token id.
        ngram (:obj:`int`, `optional`, defaults to 2)
            Number of future tokens to predict. Set to 1 to be the same as a traditional language model, predicting
            only the next first token.
        num_buckets (:obj:`int`, `optional`, defaults to 32)
            The number of buckets to use for each attention layer. This is for relative position calculation. See the
            `T5 paper <https://arxiv.org/abs/1910.10683>`__ for more details.
        relative_max_distance (:obj:`int`, `optional`, defaults to 128)
            Relative distances greater than this number will be put into the last same bucket. This is for relative
            position calculation. See the `T5 paper <https://arxiv.org/abs/1910.10683>`__ for more details.
        disable_ngram_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to train the model predicting only the next first token.
        eps (:obj:`float`, `optional`, defaults to 0.0):
            Controls the ``epsilon`` parameter value for label smoothing in the loss calculation. If set to 0, no label
            smoothing is performed.
    """

    model_type = "prophetnet"
...
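As with the other configurations touched by this commit, the class documented above can be sketched roughly as follows (illustrative only; argument names and defaults are taken from the docstring)::

    from transformers import ProphetNetConfig

    # ngram controls how many future tokens are predicted at each position;
    # num_buckets / relative_max_distance drive the relative position buckets.
    config = ProphetNetConfig(ngram=2, num_buckets=32, relative_max_distance=128)
    print(config.vocab_size)  # 30522 by default, per the docstring above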
@@ -21,16 +21,17 @@ from .file_utils import add_start_docstrings
RAG_CONFIG_DOC = r"""
    :class:`~transformers.RagConfig` stores the configuration of a `RagModel`. Configuration objects inherit from
    :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
    :class:`~transformers.PretrainedConfig` for more information.

    Args:
        title_sep (:obj:`str`, `optional`, defaults to ``" / "``):
            Separator inserted between the title and the text of the retrieved document when calling
            :class:`~transformers.RagRetriever`.
        doc_sep (:obj:`str`, `optional`, defaults to ``" // "``):
            Separator inserted between the text of the retrieved document and the original input when calling
            :class:`~transformers.RagRetriever`.
        n_docs (:obj:`int`, `optional`, defaults to 5):
            Number of documents to retrieve.
        max_combined_length (:obj:`int`, `optional`, defaults to 300):
@@ -41,8 +42,8 @@ RAG_CONFIG_DOC = r"""
            Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated
            by :class:`~transformers.RagRetriever`.
        dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
            A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
            using :obj:`datasets.list_datasets()`).
        dataset_split (:obj:`str`, `optional`, defaults to :obj:`"train"`)
            Which split of the :obj:`dataset` to load.
        index_name (:obj:`str`, `optional`, defaults to :obj:`"compressed"`)
@@ -59,13 +60,13 @@ RAG_CONFIG_DOC = r"""
            Only relevant if ``return_loss`` is set to :obj:`True`. Controls the ``epsilon`` parameter value for label
            smoothing in the loss calculation. If set to 0, no label smoothing is performed.
        do_marginalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, the logits are marginalized over all documents by making use of
            ``torch.nn.functional.log_softmax``.
        reduce_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to reduce the NLL loss using the ``torch.Tensor.sum`` operation.
        do_deduplication (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to deduplicate the generations from different context documents for a given input. Has to be
            set to :obj:`False` if used while training with a distributed backend.
        exclude_bos_score (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to disregard the BOS token when computing the loss.
        output_retrieved (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -160,7 +161,8 @@ class RagConfig(PretrainedConfig):
        cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
        configuration and decoder model configuration.

        Returns:
            :class:`EncoderDecoderConfig`: An instance of a configuration object
@@ -169,7 +171,8 @@ class RagConfig(PretrainedConfig):
    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default
        :meth:`~transformers.PretrainedConfig.to_dict`.

        Returns:
            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
...
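The classmethod whose signature appears in the hunk above combines a question-encoder configuration and a generator configuration into one composite config. A rough sketch, assuming the classmethod name used by the transformers library (the name itself is not shown in this excerpt)::

    from transformers import BartConfig, DPRConfig, RagConfig

    question_encoder_config = DPRConfig()
    generator_config = BartConfig()
    # n_docs and do_deduplication are documented arguments of RagConfig.
    rag_config = RagConfig.from_question_encoder_generator_configs(
        question_encoder_config, generator_config, n_docs=5, do_deduplication=True
    )
    print(rag_config.to_dict()["n_docs"])  # 5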
@@ -32,16 +32,15 @@ class ReformerConfig(PretrainedConfig):
    This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`. It is used to
    instantiate a Reformer model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        attention_head_size (:obj:`int`, `optional`, defaults to 64):
            Dimensionality of the projected key, query and value vectors.
        attn_layers (:obj:`List[str]`, `optional`, defaults to :obj:`["local", "lsh", "local", "lsh", "local", "lsh"]`):
            List of attention layer types in ascending order. It can be chosen between an LSHSelfAttention layer
            (:obj:`"lsh"`) and a LocalSelfAttention layer (:obj:`"local"`).
            For more information on the LSHSelfAttention layer, see `LSH Self Attention
            <reformer.html#lsh-self-attention>`__. For more information on the LocalSelfAttention layer, see `Local Self
@@ -65,9 +64,9 @@ class ReformerConfig(PretrainedConfig):
            For more information on how axial position embeddings work, see `Axial Position Encodings
            <reformer.html#axial-positional-encodings>`__.
        chunk_size_lm_head (:obj:`int`, `optional`, defaults to 0):
            The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed
            forward layer is not chunked. A chunk size of n means that the feed forward layer processes n <
            sequence_length embeddings at a time.
            For more information on feed forward chunking, see `How does Feed Forward Chunking work?
            <../glossary.html#feed-forward-chunking>`__.
@@ -81,8 +80,7 @@ class ReformerConfig(PretrainedConfig):
            :obj:`None` to ensure fully random rotations in the local sensitive hashing scheme.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
            The non-linear activation function (function or string) in the feed forward layer in the residual attention
            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_size (:obj:`int`, `optional`, defaults to 256):
@@ -97,8 +95,8 @@ class ReformerConfig(PretrainedConfig):
            The epsilon used by the layer normalization layers.
        local_chunk_length (:obj:`int`, `optional`, defaults to 64):
            Length of chunk which attends to itself in :obj:`LocalSelfAttention`. Chunking reduces memory complexity
            from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length /
            chunk length (chunked self attention).
        local_num_chunks_before (:obj:`int`, `optional`, defaults to 1):
            Number of previous neighbouring chunks to attend to in :obj:`LocalSelfAttention` layer to itself.
        local_num_chunks_after (:obj:`int`, `optional`, defaults to 0):
@@ -108,8 +106,8 @@ class ReformerConfig(PretrainedConfig):
            The dropout ratio for the attention probabilities in :obj:`LocalSelfAttention`.
        lsh_attn_chunk_length (:obj:`int`, `optional`, defaults to 64):
            Length of chunk which attends to itself in :obj:`LSHSelfAttention`. Chunking reduces memory complexity from
            sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk
            length (chunked self attention).
        lsh_num_chunks_before (:obj:`int`, `optional`, defaults to 1):
            Number of previous neighbouring chunks to attend to in :obj:`LSHSelfAttention` layer to itself.
        lsh_num_chunks_after (:obj:`int`, `optional`, defaults to 0):
@@ -117,23 +115,22 @@ class ReformerConfig(PretrainedConfig):
        lsh_attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities in :obj:`LSHSelfAttention`.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 4096):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_buckets (:obj:`int` or :obj:`List[int]`, `optional`):
            Number of buckets the key query vectors can be "hashed into" using the locality sensitive hashing scheme.
            Each query key vector is hashed into a hash in :obj:`1, ..., num_buckets`. The number of buckets can also
            be factorized into a list for improved memory complexity. In this case, each query key vector is hashed
            into a hash in :obj:`1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if
            :obj:`num_buckets` is factorized into two factors. The number of buckets (or the product of the factors)
            should approximately equal sequence length / lsh_chunk_length. If :obj:`num_buckets` is not set, a good
            value is calculated on the fly.
        num_hashes (:obj:`int`, `optional`, defaults to 1):
            Number of hashing rounds (e.g., number of random rotations) in the Local Sensitive Hashing scheme. The
            higher :obj:`num_hashes`, the more accurate the :obj:`LSHSelfAttention` becomes, but also the more memory
            and time intensive the hashing becomes.
        pad_token_id (:obj:`int`, `optional`, defaults to 0):
            The token id for the padding token.
        vocab_size (:obj:`int`, `optional`, defaults to 320):
...
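A small sketch of the configuration documented above (illustrative; only arguments that appear in the docstring are used)::

    from transformers import ReformerConfig

    # Alternate local and LSH self-attention layers, as in the documented default.
    config = ReformerConfig(
        attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
        num_hashes=1,
    )
    print(config.hidden_size)              # 256 by default, per the docstring above
    print(config.max_position_embeddings)  # 4096 by default, per the docstring above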
@@ -28,13 +28,11 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RetriBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`. It is used
    to instantiate a RetriBert model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
@@ -50,15 +48,15 @@ class RetriBertConfig(PretrainedConfig):
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
...
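Again for orientation, a minimal sketch of the class documented above (not part of the diff; defaults quoted from the docstring)::

    from transformers import RetriBertConfig

    # Override two of the documented arguments; the rest keep their defaults.
    config = RetriBertConfig(hidden_dropout_prob=0.1, max_position_embeddings=512)
    print(config.intermediate_size)  # 3072 by default, per the docstring above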
@@ -38,12 +38,11 @@ class RobertaConfig(BertConfig):
    arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
    same defaults. Please check the parent class for more information.

    Examples::
...
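The inheritance relationship stated above can be checked directly (illustrative snippet, not part of the diff)::

    from transformers import BertConfig, RobertaConfig

    # RobertaConfig directly inherits BertConfig and reuses its defaults.
    assert issubclass(RobertaConfig, BertConfig)
    config = RobertaConfig()
    print(config.hidden_size)  # same default as BertConfig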
@@ -29,19 +29,17 @@ SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class SqueezeBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.SqueezeBertModel`. It is used
    to instantiate a SqueezeBERT model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the SqueezeBERT model. Defines the number of different tokens that can be represented by
            the :obj:`inputs_ids` passed when calling :class:`~transformers.SqueezeBertModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
@@ -51,15 +49,15 @@ class SqueezeBertConfig(PretrainedConfig):
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
@@ -85,7 +83,7 @@ class SqueezeBertConfig(PretrainedConfig):
        output_groups (:obj:`int`, `optional`, defaults to 4):
            The number of groups in the third feed forward network layer.

    Examples::

        >>> from transformers import SqueezeBertModel, SqueezeBertConfig
@@ -98,9 +96,8 @@ class SqueezeBertConfig(PretrainedConfig):
        >>> # Accessing the model configuration
        >>> configuration = model.config

    Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
        checkpoints.
    """

    pretrained_config_archive_map = SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "squeezebert"
...
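Beyond the doctest already embedded in the docstring, a short sketch overriding a couple of the documented arguments (illustrative only)::

    from transformers import SqueezeBertConfig

    config = SqueezeBertConfig(hidden_act="gelu", max_position_embeddings=512, output_groups=4)
    print(config.vocab_size)  # 30522 by default, per the docstring above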
@@ -32,36 +32,34 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class T5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.T5Model` or a
    :class:`~transformers.TFT5Model`. It is used to instantiate a T5 model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
    to that of the T5 `t5-small <https://huggingface.co/t5-small>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Arguments:
        vocab_size (:obj:`int`, `optional`, defaults to 32128):
            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
            :obj:`inputs_ids` passed when calling :class:`~transformers.T5Model` or :class:`~transformers.TFT5Model`.
        n_positions (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        d_model (:obj:`int`, `optional`, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (:obj:`int`, `optional`, defaults to 64):
            Size of the key, query, value projections per attention head. :obj:`d_kv` has to be equal to :obj:`d_model
            // num_heads`.
        d_ff (:obj:`int`, `optional`, defaults to 2048):
            Size of the intermediate feed forward layer in each :obj:`T5Block`.
        num_layers (:obj:`int`, `optional`, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (:obj:`int`, `optional`):
            Number of hidden layers in the Transformer decoder. Will use the same value as :obj:`num_layers` if not
            set.
        num_heads (:obj:`int`, `optional`, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
            The number of buckets to use for each attention layer.
        dropout_rate (:obj:`float`, `optional`, defaults to 0.1):
...
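A minimal sketch of the configuration documented above, keeping the documented constraint between :obj:`d_kv`, :obj:`d_model` and :obj:`num_heads` (illustrative only)::

    from transformers import T5Config

    # d_kv is set to d_model // num_heads, as the docstring above requires.
    config = T5Config(d_model=512, num_heads=8, d_kv=512 // 8, num_layers=6)
    print(config.d_ff)  # 2048 by default, per the docstring above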
@@ -32,13 +32,12 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel` or a
    :class:`~transformers.TFTransfoXLModel`. It is used to instantiate a Transformer-XL model according to the
    specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a
    similar configuration to that of the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 267735):
...
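Only :obj:`vocab_size` is visible in this excerpt; a minimal sketch using just that argument (illustrative only)::

    from transformers import TransfoXLConfig

    config = TransfoXLConfig(vocab_size=267735)
    print(config.vocab_size)  # 267735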
@@ -29,27 +29,26 @@ logger = logging.get_logger(__name__)
class PretrainedConfig(object):
    r"""
    Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
    methods for loading/downloading/saving configurations.

    Note: A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
    initialize a model does **not** load the model weights. It only affects the model's configuration.

    Class attributes (overridden by derived classes)

    - **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
      recreate the correct object in :class:`~transformers.AutoConfig`.
    - **is_composition** (:obj:`bool`): Whether the config class is composed of multiple sub-configs. In this case
      the config has to be initialized from two or more configs of type :class:`~transformers.PretrainedConfig`
      like: :class:`~transformers.EncoderDecoderConfig` or :class:`~RagConfig`.

    Args:
        name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`):
            Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or
            :func:`~transformers.TFPreTrainedModel.from_pretrained` as ``pretrained_model_name_or_path`` if the
            configuration was created with such a method.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the model should return all hidden-states.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -57,68 +56,72 @@ class PretrainedConfig(object):
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain
            tuple.
        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the model is used as an encoder/decoder or not.
        is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the model is used as decoder or not (in which case it's used as an encoder).
        add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
            that can be used as decoder models within the :class:`~transformers.EncoderDecoderModel` class, which
            consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
        tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`)
            Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
            and decoder model to have the exact same parameter names.
        prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
            Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
            heads to prune in said layer.
            For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
        xla_device (:obj:`bool`, `optional`):
            A flag to indicate if TPUs are available or not.
        chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
            The chunk size of all feed forward layers in the residual attention blocks. A chunk size of :obj:`0` means
            that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes
            :obj:`n` < sequence_length embeddings at a time. For more information on feed forward chunking, see `How
            does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__.

    Parameters for sequence generation

    - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by default in the
      :obj:`generate` method of the model.
    - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by default in the
      :obj:`generate` method of the model.
    - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in the
      :obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
    - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default
      in the :obj:`generate` method of the model. Whether to stop the beam search when at least ``num_beams``
      sentences are finished per batch or not.
    - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be used by
      default in the :obj:`generate` method of the model. 1 means no beam search.
    - **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
      probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
      positive.
    - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to keep
      for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
    - **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
      :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens with
      probabilities that add up to ``top_p`` or higher are kept for generation.
    - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty that
      will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
    - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that will
      be used by default in the :obj:`generate` method of the model.
    - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default in the
      :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of that size
      can only occur once.
    - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be generated
      that will be used by default in the :obj:`generate` method of the model. In order to get the tokens of the
      words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word,
      add_prefix_space=True)`.
    - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed returned
      sequences for each element in the batch that will be used by default in the :obj:`generate` method of the
      model.

    Parameters for fine-tuning tasks

    - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the model
      pretrained weights.
    - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
      used when converting from an original (TensorFlow or PyTorch) checkpoint.
    - **id2label** (:obj:`Dict[int, str]`, `optional`) -- A map from index (for instance prediction index, or
@@ -126,27 +129,32 @@ class PretrainedConfig(object):
    - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
    - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
      typically for a classification task.
    - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for the
      current task.

    Parameters linked to the tokenizer

    - **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each text
      before calling the model.
    - **bos_token_id** (:obj:`int`, `optional`) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token. - **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token. - **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with - **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with a
a different token than `bos`, the id of that token. different token than `bos`, the id of that token.
- **sep_token_id** (:obj:`int`, `optional`)) -- The id of the `separation` token. - **sep_token_id** (:obj:`int`, `optional`)) -- The id of the `separation` token.
PyTorch specific parameters PyTorch specific parameters
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be - **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
used with Torchscript. used with Torchscript.
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has a output word embedding layer. - **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and
output word embeddings should be tied. Note that this is only relevant if the model has a output word
embedding layer.
TensorFlow specific parameters TensorFlow specific parameters
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
use BFloat16 scalars (only used by some TensorFlow models). - **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should use
BFloat16 scalars (only used by some TensorFlow models).
""" """
model_type: str = "" model_type: str = ""
is_composition: bool = False is_composition: bool = False
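As a rough illustration (the checkpoint name and the values below are just assumptions), these attributes can be set on any concrete config subclass and are then picked up by :obj:`generate` whenever the matching argument is not passed explicitly::

    from transformers import BartConfig, BartForConditionalGeneration

    config = BartConfig.from_pretrained("facebook/bart-large-cnn")
    config.num_beams = 4
    config.no_repeat_ngram_size = 3
    config.early_stopping = True

    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn", config=config)
    # model.generate(input_ids) now defaults to beam search with the values above.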
@@ -293,15 +301,14 @@ class PretrainedConfig(object):

    Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
    Whether or not to force to (re-)download the configuration files and override the cached versions if they exist.
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
    Whether or not to delete incompletely received files. Attempts to resume the download if such a file exists.
proxies (:obj:`Dict[str, str]`, `optional`):
    A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
    If :obj:`False`, then this function returns just the final configuration object.

@@ -310,8 +317,8 @@ class PretrainedConfig(object):

    the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored.
kwargs (:obj:`Dict[str, Any]`, `optional`):
    The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the ``return_unused_kwargs`` keyword parameter.

Returns:
    :class:`PretrainedConfig`: The configuration object instantiated from this pretrained model.
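A brief illustration of the ``return_unused_kwargs`` behaviour, sketched with :class:`~transformers.BertConfig` (any concrete config subclass works the same way)::

    from transformers import BertConfig

    # `output_attentions` is a config attribute, `foo` is not: with
    # return_unused_kwargs=True the unknown key is handed back instead of being set.
    config, unused_kwargs = BertConfig.from_pretrained(
        "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
    )
    assert config.output_attentions is True
    assert unused_kwargs == {"foo": False}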
@@ -337,8 +344,8 @@ class PretrainedConfig(object):

@classmethod
def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a :class:`~transformers.PretrainedConfig` using ``from_dict``.

    Parameters:
        pretrained_model_name_or_path (:obj:`str`):
@@ -469,9 +476,8 @@ class PretrainedConfig(object):

def to_diff_dict(self) -> Dict[str, Any]:
    """
    Removes all attributes from config which correspond to the default config attributes for better readability and serializes to a Python dictionary.

    Returns:
        :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,

...
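A small sketch of the intent (the exact keys in the returned dictionary depend on the config class and its defaults)::

    from transformers import BertConfig

    config = BertConfig(attention_probs_dropout_prob=0.3)
    diff = config.to_diff_dict()
    # The overridden value is serialized; attributes left at the shared base
    # defaults are dropped from the dictionary.
    assert diff["attention_probs_dropout_prob"] == 0.3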
@@ -37,19 +37,17 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLMConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel` or a :class:`~transformers.TFXLMModel`. It is used to instantiate an XLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30145):
            Vocabulary size of the XLM model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.XLMModel` or :class:`~transformers.TFXLMModel`.
        emb_dim (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        n_layer (:obj:`int`, `optional`, defaults to 12):

@@ -57,8 +55,7 @@ class XLMConfig(PretrainedConfig):

        n_head (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for the attention mechanism.
        gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):

@@ -66,28 +63,25 @@ class XLMConfig(PretrainedConfig):

        sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
        causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead of a bidirectional context.
        asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer.
        n_langs (:obj:`int`, `optional`, defaults to 1):
            The number of languages the model handles. Set to 1 for monolingual models.
        use_lang_emb (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__ for information on how to use them.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        embed_init_std (:obj:`float`, `optional`, defaults to 2048^-0.5):
            The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
        init_std (:obj:`int`, `optional`, defaults to 50257):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        bos_index (:obj:`int`, `optional`, defaults to 0):

@@ -135,8 +129,7 @@ class XLMConfig(PretrainedConfig):

        mask_token_id (:obj:`int`, `optional`, defaults to 0):
            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
        lang_id (:obj:`int`, `optional`, defaults to 1):
            The ID of the language used by the model. This parameter is used when generating text in a given language.

    Examples::

...
@@ -28,8 +28,8 @@ XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLMProphetNetConfig(ProphetNetConfig):
    """
    This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate documentation alongside usage examples.
    """

    model_type = "xlm-prophetnet"
@@ -33,8 +33,8 @@ XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLMRobertaConfig(RobertaConfig):
    """
    This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate documentation alongside usage examples.
    """

    model_type = "xlm-roberta"
@@ -32,13 +32,12 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLNetConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel` or a :class:`~transformers.TFXLNetModel`. It is used to instantiate an XLNet model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 32000):

@@ -54,8 +53,8 @@ class XLNetConfig(PretrainedConfig):

        d_inner (:obj:`int`, `optional`, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to untie relative position biases.
        attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):

@@ -67,18 +66,16 @@ class XLNetConfig(PretrainedConfig):

        dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        mem_len (:obj:`int` or :obj:`None`, `optional`):
            The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous forward pass won't be re-computed. See the `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__ for more information.
        reuse_len (:obj:`int`, `optional`):
            The number of tokens in the current batch to be cached and reused in the future.
        bi_data (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to use bidirectional input pipeline. Usually set to :obj:`True` during pretraining and :obj:`False` during finetuning.
        clamp_len (:obj:`int`, `optional`, defaults to -1):
            Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping.
        same_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to use the same attention length for each token.
        summary_type (:obj:`str`, `optional`, defaults to "last"):

...
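A minimal instantiation sketch, in the style of the usage examples these config docstrings usually carry::

    from transformers import XLNetConfig, XLNetModel

    # Initializing a configuration with default values (similar to xlnet-large-cased)
    configuration = XLNetConfig()
    # Initializing a model from that configuration
    model = XLNetModel(configuration)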
""" """
This script can be used to convert a head-less TF2.x Bert model to PyTorch, This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official GitHub:
as published on the official GitHub: https://github.com/tensorflow/models/tree/master/official/nlp/bert https://github.com/tensorflow/models/tree/master/official/nlp/bert
TF2.x uses different variable names from the original BERT (TF 1.4) implementation. TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert
The script re-maps the TF2.x Bert weight names to the original names, so the model can be imported with Huggingface/transformer. weight names to the original names, so the model can be imported with Huggingface/transformer.
You may adapt this script to include classification/MLM/NSP/etc. heads. You may adapt this script to include classification/MLM/NSP/etc. heads.
""" """
......
@@ -28,12 +28,13 @@ from transformers import BertModel

def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
    """
    Args:
        model: BertModel Pytorch model instance to be converted
        ckpt_dir: Tensorflow model directory
        model_name: model name

    Currently supported HF models:

        - Y BertModel
        - N BertForMaskedLM
        - N BertForPreTraining

...
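A hedged usage sketch; the import path below is assumed from the script's filename and may differ depending on where the script lives in your checkout::

    from transformers import BertModel
    from transformers.convert_bert_pytorch_checkpoint_to_original_tf import (
        convert_pytorch_checkpoint_to_tf,
    )

    # Load the PyTorch weights, then write a TF 1.x-style checkpoint next to them.
    model = BertModel.from_pretrained("bert-base-uncased")
    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir="./tf_ckpt", model_name="bert-base-uncased")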
@@ -77,7 +77,8 @@ class OnnxConverterArgumentParser(ArgumentParser):

def generate_identified_filename(filename: Path, identifier: str) -> Path:
    """
    Append a string identifier at the end (before the extension, if any) of the provided filepath.

    Args:
        filename: pathlib.Path The actual path object we would like to add an identifier suffix
        identifier: The suffix to add
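For instance, a small sketch of the expected behaviour::

    from pathlib import Path
    from transformers.convert_graph_to_onnx import generate_identified_filename

    # The identifier is inserted before the extension:
    # "model.onnx" -> "model-optimized.onnx"
    optimized_path = generate_identified_filename(Path("model.onnx"), "-optimized")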
@@ -89,7 +90,8 @@ def generate_identified_filename(filename: Path, identifier: str) -> Path:

def check_onnxruntime_requirements(minimum_version: Version):
    """
    Check that onnxruntime is installed and that the installed version is recent enough.

    Raises:
        ImportError: If onnxruntime is not installed or the version found is too old
    """
@@ -117,7 +119,8 @@ def check_onnxruntime_requirements(minimum_version: Version):

def ensure_valid_input(model, tokens, input_names):
    """
    Ensure inputs are presented in the correct order, without any None values.

    Args:
        model: The model used to forward the input data
        tokens: BatchEncoding holding the input data
@@ -144,12 +147,14 @@ def ensure_valid_input(model, tokens, input_names):

def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:
    """
    Attempt to infer the static vs. dynamic axes for each input and output tensor of a specific model.

    Args:
        nlp: The pipeline object holding the model to be exported
        framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)

    Returns:

        - List of the inferred input variable names
        - List of the inferred output variable names
        - Dictionary with input/output variables names as key and shape tensor as value
@@ -206,7 +211,8 @@ def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:

def load_graph_from_args(pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None) -> Pipeline:
    """
    Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model).

    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The actual model to convert the pipeline from ("pt" or "tf")
@@ -234,7 +240,8 @@ def load_graph_from_args(pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None) -> Pipeline:

def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR).

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
@@ -272,7 +279,8 @@ def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):

def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR).

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
@@ -316,7 +324,8 @@ def convert(

    pipeline_name: str = "feature-extraction",
):
    """
    Convert the pipeline object to the ONNX Intermediate Representation (IR) format.

    Args:
        framework: The framework the pipeline is backed by ("pt" or "tf")
        model: The name of the model to load for the pipeline
@@ -349,8 +358,9 @@ def convert(

def optimize(onnx_model_path: Path) -> Path:
    """
    Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possible.

    Args:
        onnx_model_path: filepath where the model binary description is stored
@@ -373,7 +383,8 @@ def optimize(onnx_model_path: Path) -> Path:

def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.

    Args:
        onnx_model_path: Path to location the exported ONNX model is stored

...
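Taken together, a typical export flow with these helpers might look like the following sketch (the model name and opset value are illustrative assumptions)::

    from pathlib import Path
    from transformers.convert_graph_to_onnx import convert, optimize, quantize

    output = Path("onnx/bert-base-cased.onnx")

    # Export a PyTorch-backed feature-extraction pipeline to ONNX.
    convert(framework="pt", model="bert-base-cased", output=output, opset=11)

    # Let onnxruntime fold/fuse what it can, then quantize the weights to int8.
    optimized_path = optimize(output)
    quantized_path = quantize(optimized_path)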
@@ -27,14 +27,16 @@ LANG_CODE_PATH = "lang_code_data/language-codes-3b2.csv"

class TatoebaConverter:
    """
    Convert Tatoeba-Challenge models to huggingface format.

    Steps:

        1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
        2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique one exists, e.g. aav-eng -> aav-en, heb-eng -> he-en.
        3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group members.
    """

    def __init__(self, save_dir="marian_converted"):

@@ -148,8 +150,9 @@ class TatoebaConverter:

        repo_root=DEFAULT_REPO,
        dry_run=False,
    ) -> str:
        """
        Copy the most recent model's readme section from opus, and add metadata.

        upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
        """
        short_pair = remove_prefix(hf_model_id, "opus-mt-")
        extra_metadata = self.metadata.loc[short_pair].drop("2m")

...
@@ -152,8 +152,9 @@ def convert_opus_name_to_hf_name(x):

def convert_hf_name_to_opus_name(hf_model_name):
    """
    Relies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME.
    """
    hf_model_name = remove_prefix(hf_model_name, ORG_NAME)
    if hf_model_name in GROUP_TO_OPUS_NAME:
        opus_w_prefix = GROUP_TO_OPUS_NAME[hf_model_name]

@@ -173,6 +174,7 @@ def get_system_metadata(repo_root):

    )


# docstyle-ignore
FRONT_MATTER_TEMPLATE = """---
language:
{}

@@ -181,7 +183,6 @@ tags:

license: apache-2.0
---
"""

DEFAULT_REPO = "Tatoeba-Challenge"
DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models")

@@ -194,8 +195,9 @@ def write_model_card(

    dry_run=False,
    extra_metadata={},
) -> str:
    """
    Copy the most recent model's readme section from opus, and add metadata.

    upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
    """
    import pandas as pd

...
@@ -12,10 +12,11 @@

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities to convert slow tokenizers into their fast tokenizer counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizer files and to make our dependency on SentencePiece optional.
"""

from typing import Dict, List, Tuple
@@ -31,8 +32,7 @@ from .file_utils import requires_sentencepiece

class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
@@ -602,7 +602,8 @@ SLOW_TO_FAST_CONVERTERS = {

def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
    """
    Convert a slow tokenizer instance into a fast tokenizer instance.

    Args:
        transformer_tokenizer (:class:`~transformers.tokenization_utils_base.PreTrainedTokenizer`):

...
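A small usage sketch, assuming a checkpoint for which a slow (pure Python) tokenizer is available::

    from transformers import BertTokenizer
    from transformers.convert_slow_tokenizer import convert_slow_tokenizer

    slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # Returns a `tokenizers.Tokenizer` backend that a fast tokenizer class can wrap.
    fast_backend = convert_slow_tokenizer(slow_tokenizer)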