Unverified Commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Syling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -23,9 +23,8 @@ class BertGenerationConfig(PretrainedConfig):
 :class:`~transformers.BertGenerationPreTrainedModel`. It is used to instantiate a BertGeneration model according to
 the specified arguments, defining the model architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 vocab_size (:obj:`int`, `optional`, defaults to 50358):
@@ -40,15 +39,15 @@ class BertGenerationConfig(PretrainedConfig):
 intermediate_size (:obj:`int`, `optional`, defaults to 3072):
 Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
 hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout ratio for the attention probabilities.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 initializer_range (:obj:`float`, `optional`, defaults to 0.02):
 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
......
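For readers skimming this documentation-only diff, a minimal sketch of how the config described above is typically used (illustration only, not part of the PR; it assumes the standard `BertGenerationConfig` and `BertGenerationEncoder` classes exported by the library):

```python
from transformers import BertGenerationConfig, BertGenerationEncoder

# Instantiate with the documented defaults (vocab_size=50358, hidden_act="gelu", ...).
config = BertGenerationConfig()

# Or override a few of the arguments documented above before building the model.
config = BertGenerationConfig(hidden_act="gelu_new", hidden_dropout_prob=0.2)
model = BertGenerationEncoder(config)
```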
@@ -14,7 +14,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # LICENSE file in the root directory of this source tree.
-"""BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document blenderbot-90M defaults."""
+"""
+BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document
+blenderbot-90M defaults.
+"""
 from .configuration_bart import BartConfig
@@ -26,12 +29,12 @@ BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class BlenderbotConfig(BartConfig):
 r"""
-This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotForConditionalGeneration`.
-It inherits from :class:`~transformers.BartConfig` and has the same signature with different defaults.
+This is the configuration class to store the configuration of a
+:class:`~transformers.BlenderbotForConditionalGeneration`. It inherits from :class:`~transformers.BartConfig` and
+has the same signature with different defaults.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 vocab_size (:obj:`int`, `optional`, defaults to 54944):
@@ -52,8 +55,8 @@ class BlenderbotConfig(BartConfig):
 encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
 Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
 activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 dropout (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
@@ -63,8 +66,8 @@ class BlenderbotConfig(BartConfig):
 classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
 The dropout ratio for classifier.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 init_std (:obj:`float`, `optional`, defaults to 0.02):
 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -88,11 +91,11 @@ class BlenderbotConfig(BartConfig):
 bos_token_id (:obj:`int`, `optional`, defaults to 0)
 Beginning of stream token id.
 encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-The LayerDrop probability for the encoder. See the `LayerDrop paper
-<see https://arxiv.org/abs/1909.11556>`__ for more details.
+The LayerDrop probability for the encoder. See the `LayerDrop paper <see
+https://arxiv.org/abs/1909.11556>`__ for more details.
 decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-The LayerDrop probability for the decoder. See the `LayerDrop paper
-<see https://arxiv.org/abs/1909.11556>`__ for more details.
+The LayerDrop probability for the decoder. See the `LayerDrop paper <see
+https://arxiv.org/abs/1909.11556>`__ for more details.
 extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
 How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
 is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
......
@@ -30,8 +30,8 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class CamembertConfig(RobertaConfig):
 """
-This class overrides :class:`~transformers.RobertaConfig`. Please check the
-superclass for the appropriate documentation alongside usage examples.
+This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate
+documentation alongside usage examples.
 """
 model_type = "camembert"
@@ -26,13 +26,12 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
 class CTRLConfig(PretrainedConfig):
 """
 This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel` or a
-:class:`~transformers.TFCTRLModel`. It is used to instantiate a CTRL model according to the specified
-arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-configuration to that of the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
+:class:`~transformers.TFCTRLModel`. It is used to instantiate a CTRL model according to the specified arguments,
+defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
+to that of the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 vocab_size (:obj:`int`, `optional`, defaults to 246534):
@@ -40,8 +39,8 @@ class CTRLConfig(PretrainedConfig):
 :obj:`inputs_ids` passed when calling :class:`~transformers.CTRLModel` or
 :class:`~transformers.TFCTRLModel`.
 n_positions (:obj:`int`, `optional`, defaults to 256):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 n_ctx (:obj:`int`, `optional`, defaults to 256):
 Dimensionality of the causal mask (usually same as n_positions).
 n_embd (:obj:`int`, `optional`, defaults to 1280):
......
@@ -45,16 +45,16 @@ class DebertaConfig(PretrainedConfig):
 intermediate_size (:obj:`int`, `optional`, defaults to 3072):
 Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
 hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
 :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
 hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout ratio for the attention probabilities.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 type_vocab_size (:obj:`int`, `optional`, defaults to 2):
 The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
 :class:`~transformers.TFDebertaModel`.
@@ -65,15 +65,15 @@ class DebertaConfig(PretrainedConfig):
 relative_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
 Whether use relative position encoding.
 max_relative_positions (:obj:`int`, `optional`, defaults to 1):
-The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`.
-Use the same value as :obj:`max_position_embeddings`.
+The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
+value as :obj:`max_position_embeddings`.
 pad_token_id (:obj:`int`, `optional`, defaults to 0):
 The value used to pad input_ids.
 position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`True`):
 Whether add absolute position embedding to content embedding.
 pos_att_type (:obj:`List[str]`, `optional`):
-The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`,
-e.g. :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
+The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
+:obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
 layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
 The epsilon used by the layer normalization layers.
 """
......
@@ -36,21 +36,20 @@ class DistilBertConfig(PretrainedConfig):
 This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel` or a
 :class:`~transformers.TFDistilBertModel`. It is used to instantiate a DistilBERT model according to the specified
 arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-configuration to that of the DistilBERT
-`distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
+configuration to that of the DistilBERT `distilbert-base-uncased
+<https://huggingface.co/distilbert-base-uncased>`__ architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 vocab_size (:obj:`int`, `optional`, defaults to 30522):
-Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by the
-:obj:`inputs_ids` passed when calling :class:`~transformers.DistilBertModel` or
+Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by
+the :obj:`inputs_ids` passed when calling :class:`~transformers.DistilBertModel` or
 :class:`~transformers.TFDistilBertModel`.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 sinusoidal_pos_embds (:obj:`boolean`, `optional`, defaults to :obj:`False`):
 Whether to use sinusoidal positional embeddings.
 n_layers (:obj:`int`, `optional`, defaults to 6):
@@ -66,8 +65,8 @@ class DistilBertConfig(PretrainedConfig):
 attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
 The dropout ratio for the attention probabilities.
 activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 initializer_range (:obj:`float`, `optional`, defaults to 0.02):
 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
......
@@ -32,20 +32,19 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class DPRConfig(PretrainedConfig):
 r"""
-:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
-`DPRModel`.
+:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a `DPRModel`.
 This is the configuration class to store the configuration of a :class:`~transformers.DPRContextEncoder`,
 :class:`~transformers.DPRQuestionEncoder`, or a :class:`~transformers.DPRReader`. It is used to instantiate the
 components of the DPR model.
-This class is a subclass of :class:`~transformers.BertConfig`. Please check the
-superclass for the documentation of all kwargs.
+This class is a subclass of :class:`~transformers.BertConfig`. Please check the superclass for the documentation of
+all kwargs.
 Args:
 vocab_size (:obj:`int`, `optional`, defaults to 30522):
-Vocabulary size of the DPR model. Defines the different tokens that
-can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+Vocabulary size of the DPR model. Defines the different tokens that can be represented by the `inputs_ids`
+passed to the forward method of :class:`~transformers.BertModel`.
 hidden_size (:obj:`int`, `optional`, defaults to 768):
 Dimensionality of the encoder layers and the pooler layer.
 num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
@@ -55,15 +54,15 @@ class DPRConfig(PretrainedConfig):
 intermediate_size (:obj:`int`, `optional`, defaults to 3072):
 Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout ratio for the attention probabilities.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 type_vocab_size (:obj:`int`, `optional`, defaults to 2):
 The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
 initializer_range (:obj:`float`, `optional`, defaults to 0.02):
@@ -73,8 +72,8 @@ class DPRConfig(PretrainedConfig):
 gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
 If True, use gradient checkpointing to save memory at the expense of slower backward pass.
 projection_dim (:obj:`int`, `optional`, defaults to 0):
-Dimension of the projection for the context and question encoders.
-If it is set to zero (default), then no projection is done.
+Dimension of the projection for the context and question encoders. If it is set to zero (default), then no
+projection is done.
 """
 model_type = "dpr"
......
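As a hedged illustration of the `projection_dim` argument documented above (not part of the PR; it assumes the `DPRConfig` and `DPRQuestionEncoder` classes from the library):

```python
from transformers import DPRConfig, DPRQuestionEncoder

# projection_dim=0 (the default) means the pooled output is used as-is;
# a non-zero value adds an extra projection layer of that size on top of it.
config = DPRConfig(projection_dim=128)
model = DPRQuestionEncoder(config)
```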
@@ -36,12 +36,11 @@ class ElectraConfig(PretrainedConfig):
 This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel` or a
 :class:`~transformers.TFElectraModel`. It is used to instantiate a ELECTRA model according to the specified
 arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-configuration to that of the ELECTRA
-`google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__ architecture.
+configuration to that of the ELECTRA `google/electra-small-discriminator
+<https://huggingface.co/google/electra-small-discriminator>`__ architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
@@ -60,15 +59,15 @@ class ElectraConfig(PretrainedConfig):
 intermediate_size (:obj:`int`, `optional`, defaults to 1024):
 Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
 The dropout ratio for the attention probabilities.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 type_vocab_size (:obj:`int`, `optional`, defaults to 2):
 The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ElectraModel` or
 :class:`~transformers.TFElectraModel`.
......
@@ -29,9 +29,8 @@ class EncoderDecoderConfig(PretrainedConfig):
 :class:`~transformers.EncoderDecoderModel`. It is used to instantiate an Encoder Decoder model according to the
 specified arguments, defining the encoder and decoder configs.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 kwargs (`optional`):
@@ -93,7 +92,8 @@ class EncoderDecoderConfig(PretrainedConfig):
 cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
 ) -> PretrainedConfig:
 r"""
-Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model configuration and decoder model configuration.
+Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
+configuration and decoder model configuration.
 Returns:
 :class:`EncoderDecoderConfig`: An instance of a configuration object
......
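The hunk above touches the docstring of `from_encoder_decoder_configs`; a minimal usage sketch of that classmethod (illustration only, assuming BERT-style encoder and decoder configs, not part of this diff):

```python
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

encoder_config = BertConfig()
# The decoder needs causal masking and cross-attention to attend to the encoder.
decoder_config = BertConfig(is_decoder=True, add_cross_attention=True)

# Combine the two configs into a single EncoderDecoderConfig, as documented above.
config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
model = EncoderDecoderModel(config=config)
```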
@@ -34,20 +34,19 @@ class FlaubertConfig(XLMConfig):
 :class:`~transformers.TFFlaubertModel`. It is used to instantiate a FlauBERT model according to the specified
 arguments, defining the model architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-Whether to apply the layer normalization before or after the feed forward layer following the
-attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+Whether to apply the layer normalization before or after the feed forward layer following the attention in
+each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
 layerdrop (:obj:`float`, `optional`, defaults to 0.0):
-Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
-with Structured Dropout. ICLR 2020)
+Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with
+Structured Dropout. ICLR 2020)
 vocab_size (:obj:`int`, `optional`, defaults to 30145):
-Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by the
-:obj:`inputs_ids` passed when calling :class:`~transformers.FlaubertModel` or
+Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by
+the :obj:`inputs_ids` passed when calling :class:`~transformers.FlaubertModel` or
 :class:`~transformers.TFFlaubertModel`.
 emb_dim (:obj:`int`, `optional`, defaults to 2048):
 Dimensionality of the encoder layers and the pooler layer.
@@ -56,8 +55,7 @@ class FlaubertConfig(XLMConfig):
 n_head (:obj:`int`, `optional`, defaults to 16):
 Number of attention heads for each attention layer in the Transformer encoder.
 dropout (:obj:`float`, `optional`, defaults to 0.1):
-The dropout probability for all fully connected
-layers in the embeddings, encoder, and pooler.
+The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
 attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probability for the attention mechanism
 gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
@@ -65,28 +63,25 @@ class FlaubertConfig(XLMConfig):
 sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
 Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
 causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
-Whether or not the model shoul behave in a causal manner.
-Causal models use a triangular attention mask in order to only attend to the left-side context instead
-if a bidirectional context.
+Whether or not the model shoul behave in a causal manner. Causal models use a triangular attention mask in
+order to only attend to the left-side context instead if a bidirectional context.
 asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
 Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
 layer.
 n_langs (:obj:`int`, `optional`, defaults to 1):
 The number of languages the model handles. Set to 1 for monolingual models.
 use_lang_emb (:obj:`bool`, `optional`, defaults to :obj:`True`)
-Whether to use language embeddings. Some models use additional language embeddings, see
-`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
-for information on how to use them.
+Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual
+models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__ for
+information on how to use them.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might
-ever be used with. Typically set this to something large just in case
-(e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 embed_init_std (:obj:`float`, `optional`, defaults to 2048^-0.5):
-The standard deviation of the truncated_normal_initializer for
-initializing the embedding matrices.
+The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
 init_std (:obj:`int`, `optional`, defaults to 50257):
-The standard deviation of the truncated_normal_initializer for
-initializing all weight matrices except the embedding matrices.
+The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
+embedding matrices.
 layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
 The epsilon used by the layer normalization layers.
 bos_index (:obj:`int`, `optional`, defaults to 0):
@@ -134,8 +129,7 @@ class FlaubertConfig(XLMConfig):
 mask_token_id (:obj:`int`, `optional`, defaults to 0):
 Model agnostic parameter to identify masked tokens when generating text in an MLM context.
 lang_id (:obj:`int`, `optional`, defaults to 1):
-The ID of the language used by the model. This parameter is used when generating
-text in a given language.
+The ID of the language used by the model. This parameter is used when generating text in a given language.
 """
 model_type = "flaubert"
......
@@ -28,8 +28,7 @@ FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class DecoderConfig(PretrainedConfig):
 r"""
-Configuration class for FSMT's decoder specific things.
-note: this is a private helper class
+Configuration class for FSMT's decoder specific things. note: this is a private helper class
 """
 model_type = "fsmt_decoder"
@@ -44,9 +43,8 @@ class FSMTConfig(PretrainedConfig):
 This is the configuration class to store the configuration of a :class:`~transformers.FSMTModel`. It is used to
 instantiate a FSMT model according to the specified arguments, defining the model architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 langs (:obj:`List[str]`):
@@ -72,8 +70,8 @@ class FSMTConfig(PretrainedConfig):
 encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
 Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
 activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 dropout (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
@@ -81,8 +79,8 @@ class FSMTConfig(PretrainedConfig):
 activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
 The dropout ratio for activations inside the fully connected layer.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 init_std (:obj:`float`, `optional`, defaults to 0.02):
 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
@@ -104,14 +102,13 @@ class FSMTConfig(PretrainedConfig):
 tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
 Whether to tie input and output embeddings.
 num_beams (:obj:`int`, `optional`, defaults to 5)
-Number of beams for beam search that will be used by default in the :obj:`generate` method
-of the model. 1 means no beam search.
+Number of beams for beam search that will be used by default in the :obj:`generate` method of the model. 1
+means no beam search.
 length_penalty (:obj:`float`, `optional`, defaults to 1)
-Exponential penalty to the length that will be used by default in the :obj:`generate` method
-of the model.
+Exponential penalty to the length that will be used by default in the :obj:`generate` method of the model.
 early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`)
-Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop
-the beam search when at least ``num_beams`` sentences are finished per batch or not.
+Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop the beam
+search when at least ``num_beams`` sentences are finished per batch or not.
 Examples::
......
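The FSMT docstring above notes that `num_beams`, `length_penalty` and `early_stopping` stored in the config become the defaults picked up by `generate()`. A rough sketch of that behaviour, complementing the collapsed Examples section (illustration only; the `facebook/wmt19-en-de` checkpoint is named here as an example and is not part of this diff):

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

model = FSMTForConditionalGeneration.from_pretrained("facebook/wmt19-en-de")
tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-en-de")

inputs = tokenizer("Machine learning is great, isn't it?", return_tensors="pt")
# No beam-search arguments passed: generate() falls back to config.num_beams,
# config.length_penalty and config.early_stopping stored in the model config.
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```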
@@ -42,9 +42,8 @@ class FunnelConfig(PretrainedConfig):
 configuration to that of the Funnel Transformer `funnel-transformer/small
 <https://huggingface.co/funnel-transformer/small>`__ architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
 vocab_size (:obj:`int`, `optional`, defaults to 30522):
@@ -66,8 +65,8 @@ class FunnelConfig(PretrainedConfig):
 d_inner (:obj:`int`, `optional`, defaults to 3072):
 Inner dimension in the feed-forward blocks.
 hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
-The non-linear activation function (function or string) in the encoder and pooler.
-If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+The non-linear activation function (function or string) in the encoder and pooler. If string,
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
 hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
 The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
@@ -75,8 +74,8 @@ class FunnelConfig(PretrainedConfig):
 activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
 The dropout probability used between the two layers of the feed-forward blocks.
 max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 type_vocab_size (:obj:`int`, `optional`, defaults to 3):
 The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.FunnelModel` or
 :class:`~transformers.TFFunnelModel`.
@@ -90,19 +89,17 @@ class FunnelConfig(PretrainedConfig):
 layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-9):
 The epsilon used by the layer normalization layers.
 pooling_type (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each
-block.
+Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each block.
 attention_type (:obj:`str`, `optional`, defaults to :obj:`"relative_shift"`):
-Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while
-the latter is faster on TPU.
+Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while the
+latter is faster on TPU.
 separate_cls (:obj:`bool`, `optional`, defaults to :obj:`True`):
 Whether or not to separate the cls token when applying pooling.
 truncate_seq (:obj:`bool`, `optional`, defaults to :obj:`False`):
-When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting
-a sequence length that is not a multiple of 2.
+When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting a
+sequence length that is not a multiple of 2.
 pool_q_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
-Whether or not to apply the pooling only to the query or to query, key and values for the attention
-layers.
+Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
 """
 model_type = "funnel"
......
@@ -33,13 +33,12 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class GPT2Config(PretrainedConfig):
 """
 This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a
-:class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified
-arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-configuration to that of the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
+:class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments,
+defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
+to that of the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
-Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-for more information.
+Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
 Args:
@@ -48,8 +47,8 @@ class GPT2Config(PretrainedConfig):
 :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or
 :class:`~transformers.TFGPT2Model`.
 n_positions (:obj:`int`, `optional`, defaults to 1024):
-The maximum sequence length that this model might ever be used with.
-Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+The maximum sequence length that this model might ever be used with. Typically set this to something large
+just in case (e.g., 512 or 1024 or 2048).
 n_ctx (:obj:`int`, `optional`, defaults to 1024):
 Dimensionality of the causal mask (usually same as n_positions).
 n_embd (:obj:`int`, `optional`, defaults to 768):
@@ -73,8 +72,8 @@ class GPT2Config(PretrainedConfig):
 initializer_range (:obj:`float`, `optional`, defaults to 0.02):
 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`):
-Argument used when doing sequence summary, used in the models
-:class:`~transformers.GPT2DoubleHeadsModel` and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
+and :class:`~transformers.TFGPT2DoubleHeadsModel`.
 Has to be one of the following options:
@@ -84,8 +83,8 @@ class GPT2Config(PretrainedConfig):
 - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
 - :obj:`"attn"`: Not implemented now, use multi-head attention.
 summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
-Argument used when doing sequence summary, used in the models
-:class:`~transformers.GPT2DoubleHeadsModel` and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
+and :class:`~transformers.TFGPT2DoubleHeadsModel`.
 Whether or not to add a projection after the vector extraction.
 summary_activation (:obj:`str`, `optional`):
@@ -94,13 +93,13 @@ class GPT2Config(PretrainedConfig):
 Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
 summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
-Argument used when doing sequence summary, used in the models
-:class:`~transformers.GPT2DoubleHeadsModel` and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
+and :class:`~transformers.TFGPT2DoubleHeadsModel`.
 Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
 summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
-Argument used when doing sequence summary, used in the models
-:class:`~transformers.GPT2DoubleHeadsModel` and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
+and :class:`~transformers.TFGPT2DoubleHeadsModel`.
 The dropout ratio to be used after the projection and activation.
 gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
......
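The GPT-2 arguments above follow the usual config-then-model pattern in the library. As a minimal sketch (not part of this diff, with illustrative values only), the sequence-summary options could be exercised like so::

    >>> from transformers import GPT2Config, GPT2DoubleHeadsModel

    >>> # Illustrative values; any keyword left out falls back to the defaults documented above.
    >>> configuration = GPT2Config(n_positions=1024, n_ctx=1024, summary_type="cls_index", summary_use_proj=True)
    >>> model = GPT2DoubleHeadsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config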
...@@ -29,20 +29,19 @@ LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -29,20 +29,19 @@ LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LayoutLMConfig(BertConfig): class LayoutLMConfig(BertConfig):
r""" r"""
This is the configuration class to store the configuration of a :class:`~transformers.LayoutLMModel`. This is the configuration class to store the configuration of a :class:`~transformers.LayoutLMModel`. It is used to
It is used to instantiate a LayoutLM model according to the specified arguments, defining the model instantiate a LayoutLM model according to the specified arguments, defining the model architecture. Instantiating a
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of configuration with the defaults will yield a similar configuration to that of the LayoutLM `layoutlm-base-uncased
the LayoutLM `layoutlm-base-uncased <https://huggingface.co/microsoft/layoutlm-base-uncased>`__ architecture. <https://huggingface.co/microsoft/layoutlm-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.BertConfig` and can be used Configuration objects inherit from :class:`~transformers.BertConfig` and can be used to control the model outputs.
to control the model outputs. Read the documentation from :class:`~transformers.BertConfig` Read the documentation from :class:`~transformers.BertConfig` for more information.
for more information.
Args: Args:
vocab_size (:obj:`int`, `optional`, defaults to 30522): vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the LayoutLM model. Defines the different tokens that Vocabulary size of the LayoutLM model. Defines the different tokens that can be represented by the
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`. `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768): hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the encoder layers and the pooler layer. Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12): num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
...@@ -52,15 +51,15 @@ class LayoutLMConfig(BertConfig): ...@@ -52,15 +51,15 @@ class LayoutLMConfig(BertConfig):
intermediate_size (:obj:`int`, `optional`, defaults to 3072): intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler. If string,
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512): max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2): type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`. The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02): initializer_range (:obj:`float`, `optional`, defaults to 0.02):
...@@ -70,8 +69,8 @@ class LayoutLMConfig(BertConfig): ...@@ -70,8 +69,8 @@ class LayoutLMConfig(BertConfig):
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense of a slower backward pass. If True, use gradient checkpointing to save memory at the expense of a slower backward pass.
max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024): max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
The maximum value that the 2D position embedding might ever be used with. The maximum value that the 2D position embedding might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 1024). just in case (e.g., 1024).
Examples:: Examples::
......
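For orientation, a minimal sketch of how the LayoutLM arguments above are typically consumed (illustrative values, not part of this commit)::

    >>> from transformers import LayoutLMConfig, LayoutLMModel

    >>> # max_2d_position_embeddings bounds the bounding-box coordinates fed to the model.
    >>> configuration = LayoutLMConfig(max_position_embeddings=512, max_2d_position_embeddings=1024)
    >>> model = LayoutLMModel(configuration)
    >>> configuration = model.config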
...@@ -37,19 +37,19 @@ class LongformerConfig(RobertaConfig): ...@@ -37,19 +37,19 @@ class LongformerConfig(RobertaConfig):
:class:`~transformers.TFLongformerModel`. It is used to instantiate a Longformer model according to the specified :class:`~transformers.TFLongformerModel`. It is used to instantiate a Longformer model according to the specified
arguments, defining the model architecture. arguments, defining the model architecture.
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`. This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`. It is used
It is used to instantiate a Longformer model according to the specified arguments, defining the model to instantiate a Longformer model according to the specified arguments, defining the model architecture.
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096. `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`. The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`. It reuses
It reuses the same defaults. Please check the parent class for more information. the same defaults. Please check the parent class for more information.
Args: Args:
attention_window (:obj:`int` or :obj:`List[int]`, `optional`, defaults to 512): attention_window (:obj:`int` or :obj:`List[int]`, `optional`, defaults to 512):
Size of an attention window around each token. If an :obj:`int`, use the same size for all layers. Size of an attention window around each token. If an :obj:`int`, use the same size for all layers. To
To specify a different window size for each layer, use a :obj:`List[int]` where specify a different window size for each layer, use a :obj:`List[int]` where ``len(attention_window) ==
``len(attention_window) == num_hidden_layers``. num_hidden_layers``.
Example:: Example::
......
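Because ``attention_window`` accepts either an :obj:`int` or a per-layer list, a short sketch of both forms may help (values are illustrative only)::

    >>> from transformers import LongformerConfig, LongformerModel

    >>> # One window size shared by every layer ...
    >>> configuration = LongformerConfig(attention_window=512)
    >>> # ... or one (even) entry per layer, so that len(attention_window) == num_hidden_layers.
    >>> configuration = LongformerConfig(attention_window=[256] * 12, num_hidden_layers=12)
    >>> model = LongformerModel(configuration)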
...@@ -32,9 +32,8 @@ class LxmertConfig(PretrainedConfig): ...@@ -32,9 +32,8 @@ class LxmertConfig(PretrainedConfig):
:class:`~transformers.TFLxmertModel`. It is used to instantiate a LXMERT model according to the specified :class:`~transformers.TFLxmertModel`. It is used to instantiate a LXMERT model according to the specified
arguments, defining the model architecture. arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
for more information.
Args: Args:
...@@ -55,15 +54,15 @@ class LxmertConfig(PretrainedConfig): ...@@ -55,15 +54,15 @@ class LxmertConfig(PretrainedConfig):
intermediate_size (:obj:`int`, `optional`, defaults to 3072): intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler. If string,
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512): max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2): type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`. The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02): initializer_range (:obj:`float`, `optional`, defaults to 0.02):
...@@ -71,15 +70,14 @@ class LxmertConfig(PretrainedConfig): ...@@ -71,15 +70,14 @@ class LxmertConfig(PretrainedConfig):
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
visual_feat_dim (:obj:`int`, `optional`, defaults to 2048): visual_feat_dim (:obj:`int`, `optional`, defaults to 2048):
This represents the last dimension of the pooled-object features used as input for the model, This represents the last dimension of the pooled-object features used as input for the model, representing
representing the size of each object feature itself. the size of each object feature itself.
visual_pos_dim (:obj:`int`, `optional`, defaults to 4): visual_pos_dim (:obj:`int`, `optional`, defaults to 4):
This represents the number of spatial features that are mixed into the visual features. This represents the number of spatial features that are mixed into the visual features. The default is set
The default is set to 4 because most commonly this will represent the location of a bounding box. to 4 because most commonly this will represent the location of a bounding box. i.e., (x, y, width, height)
i.e., (x, y, width, height)
visual_loss_normalizer (:obj:`float`, `optional`, defaults to 1/15): visual_loss_normalizer (:obj:`float`, `optional`, defaults to 1/15):
This represents the scaling factor by which each visual loss is multiplied if, during pretraining, This represents the scaling factor by which each visual loss is multiplied if, during pretraining, one
one decided to train with multiple vision-based loss objectives. decided to train with multiple vision-based loss objectives.
num_qa_labels (:obj:`int`, `optional`, defaults to 9500): num_qa_labels (:obj:`int`, `optional`, defaults to 9500):
This represents the total number of different question answering (QA) labels there are. If using more than This represents the total number of different question answering (QA) labels there are. If using more than
one dataset with QA, the user will need to account for the total number of labels that all of the datasets one dataset with QA, the user will need to account for the total number of labels that all of the datasets
...@@ -91,8 +89,8 @@ class LxmertConfig(PretrainedConfig): ...@@ -91,8 +89,8 @@ class LxmertConfig(PretrainedConfig):
This represents the total number of semantically unique attributes that lxmert will be able to classify a This represents the total number of semantically unique attributes that lxmert will be able to classify a
pooled-object feature as possessing. pooled-object feature as possessing.
task_matched (:obj:`bool`, `optional`, defaults to :obj:`True`): task_matched (:obj:`bool`, `optional`, defaults to :obj:`True`):
This task is used for sentence-image matching. If the sentence correctly describes the image, the label This task is used for sentence-image matching. If the sentence correctly describes the image, the label will
will be 1. If the sentence does not correctly describe the image, the label will be 0. be 1. If the sentence does not correctly describe the image, the label will be 0.
task_mask_lm (:obj:`bool`, `optional`, defaults to :obj:`True`): task_mask_lm (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
objective. objective.
...@@ -108,8 +106,8 @@ class LxmertConfig(PretrainedConfig): ...@@ -108,8 +106,8 @@ class LxmertConfig(PretrainedConfig):
visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`): visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to calculate the feature-regression loss objective Whether or not to calculate the feature-regression loss objective
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return the attentions from the vision, language, and cross-modality Whether or not the model should return the attentions from the vision, language, and cross-modality
layers. layers.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`): output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return the hidden states from the vision, language, and cross-modality Whether or not the model should return the hidden states from the vision, language, and cross-modality
layers. layers.
......
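A hedged sketch of wiring the visual-feature arguments above into a configuration (the values are illustrative and should match the object detector producing the features)::

    >>> from transformers import LxmertConfig, LxmertModel

    >>> # visual_feat_dim is the per-object feature size, visual_pos_dim the bounding-box size.
    >>> configuration = LxmertConfig(visual_feat_dim=2048, visual_pos_dim=4, num_qa_labels=9500)
    >>> model = LxmertModel(configuration)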
...@@ -27,9 +27,8 @@ class MarianConfig(BartConfig): ...@@ -27,9 +27,8 @@ class MarianConfig(BartConfig):
This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to
instantiate a Marian model according to the specified arguments, defining the model architecture. instantiate a Marian model according to the specified arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
for more information.
Args: Args:
vocab_size (:obj:`int`, `optional`, defaults to 58101): vocab_size (:obj:`int`, `optional`, defaults to 58101):
...@@ -50,8 +49,8 @@ class MarianConfig(BartConfig): ...@@ -50,8 +49,8 @@ class MarianConfig(BartConfig):
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler. If string,
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
dropout (:obj:`float`, `optional`, defaults to 0.1): dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0): attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...@@ -61,8 +60,8 @@ class MarianConfig(BartConfig): ...@@ -61,8 +60,8 @@ class MarianConfig(BartConfig):
classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout ratio for classifier. The dropout ratio for classifier.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512): max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
init_std (:obj:`float`, `optional`, defaults to 0.02): init_std (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
...@@ -84,11 +83,11 @@ class MarianConfig(BartConfig): ...@@ -84,11 +83,11 @@ class MarianConfig(BartConfig):
bos_token_id (:obj:`int`, `optional`, defaults to 0) bos_token_id (:obj:`int`, `optional`, defaults to 0)
Beginning of stream token id. Beginning of stream token id.
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
The LayerDrop probability for the encoder. See the `LayerDrop paper The LayerDrop probability for the encoder. See the `LayerDrop paper
<https://arxiv.org/abs/1909.11556>`__ for more details. <https://arxiv.org/abs/1909.11556>`__ for more details.
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
The LayerDrop probability for the decoder. See the `LayerDrop paper The LayerDrop probability for the decoder. See the `LayerDrop paper
<https://arxiv.org/abs/1909.11556>`__ for more details. <https://arxiv.org/abs/1909.11556>`__ for more details.
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
How many extra learned positional embeddings to use. How many extra learned positional embeddings to use.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
......
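Like the other seq2seq configurations touched by this commit, :class:`~transformers.MarianConfig` feeds straight into its model class; a minimal sketch with illustrative values::

    >>> from transformers import MarianConfig, MarianMTModel

    >>> # encoder_layerdrop / decoder_layerdrop enable LayerDrop during training when set above 0.0.
    >>> configuration = MarianConfig(max_position_embeddings=512, encoder_layerdrop=0.0, decoder_layerdrop=0.0)
    >>> model = MarianMTModel(configuration)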
...@@ -29,12 +29,11 @@ MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -29,12 +29,11 @@ MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MBartConfig(BartConfig): class MBartConfig(BartConfig):
""" """
This is the configuration class to store the configuration of a This is the configuration class to store the configuration of a
:class:`~transformers.MBartForConditionalGeneration`. It is used to :class:`~transformers.MBartForConditionalGeneration`. It is used to instantiate a BART model according to the
instantiate a BART model according to the specified arguments, defining the model architecture. specified arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
for more information.
Args: Args:
vocab_size (:obj:`int`, `optional`, defaults to 250027): vocab_size (:obj:`int`, `optional`, defaults to 250027):
...@@ -55,8 +54,8 @@ class MBartConfig(BartConfig): ...@@ -55,8 +54,8 @@ class MBartConfig(BartConfig):
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler. If string,
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
dropout (:obj:`float`, `optional`, defaults to 0.1): dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0): attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...@@ -66,8 +65,8 @@ class MBartConfig(BartConfig): ...@@ -66,8 +65,8 @@ class MBartConfig(BartConfig):
classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout ratio for classifier. The dropout ratio for classifier.
max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
init_std (:obj:`float`, `optional`, defaults to 0.02): init_std (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
...@@ -89,11 +88,11 @@ class MBartConfig(BartConfig): ...@@ -89,11 +88,11 @@ class MBartConfig(BartConfig):
bos_token_id (:obj:`int`, `optional`, defaults to 0) bos_token_id (:obj:`int`, `optional`, defaults to 0)
Beginning of stream token id. Beginning of stream token id.
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
The LayerDrop probability for the encoder. See the `LayerDrop paper The LayerDrop probability for the encoder. See the `LayerDrop paper
<https://arxiv.org/abs/1909.11556>`__ for more details. <https://arxiv.org/abs/1909.11556>`__ for more details.
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
The LayerDrop probability for the decoder. See the `LayerDrop paper The LayerDrop probability for the decoder. See the `LayerDrop paper
<https://arxiv.org/abs/1909.11556>`__ for more details. <https://arxiv.org/abs/1909.11556>`__ for more details.
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`. How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
......
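The same pattern applies to mBART; one illustrative sketch (values are examples only)::

    >>> from transformers import MBartConfig, MBartForConditionalGeneration

    >>> # Per the note above, extra_pos_embeddings should equal pad_token_id + 1 (assuming the default pad_token_id of 1).
    >>> configuration = MBartConfig(max_position_embeddings=1024, extra_pos_embeddings=2)
    >>> model = MBartForConditionalGeneration(configuration)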
...@@ -29,9 +29,8 @@ class MobileBertConfig(PretrainedConfig): ...@@ -29,9 +29,8 @@ class MobileBertConfig(PretrainedConfig):
:class:`~transformers.TFMobileBertModel`. It is used to instantiate a MobileBERT model according to the specified :class:`~transformers.TFMobileBertModel`. It is used to instantiate a MobileBERT model according to the specified
arguments, defining the model architecture. arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
for more information.
Args: Args:
...@@ -48,15 +47,15 @@ class MobileBertConfig(PretrainedConfig): ...@@ -48,15 +47,15 @@ class MobileBertConfig(PretrainedConfig):
intermediate_size (:obj:`int`, `optional`, defaults to 512): intermediate_size (:obj:`int`, `optional`, defaults to 512):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`): hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler. If string,
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0): hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512): max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2): type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.MobileBertModel` The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.MobileBertModel`
or :class:`~transformers.TFMobileBertModel`. or :class:`~transformers.TFMobileBertModel`.
...@@ -84,7 +83,7 @@ class MobileBertConfig(PretrainedConfig): ...@@ -84,7 +83,7 @@ class MobileBertConfig(PretrainedConfig):
normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`): normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`):
The normalization type in MobileBERT. The normalization type in MobileBERT.
Examples: Examples::
>>> from transformers import MobileBertModel, MobileBertConfig >>> from transformers import MobileBertModel, MobileBertConfig
...@@ -97,9 +96,8 @@ class MobileBertConfig(PretrainedConfig): ...@@ -97,9 +96,8 @@ class MobileBertConfig(PretrainedConfig):
>>> # Accessing the model configuration >>> # Accessing the model configuration
>>> configuration = model.config >>> configuration = model.config
Attributes: Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
pretrained_config_archive_map (Dict[str, str]): checkpoints.
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "mobilebert" model_type = "mobilebert"
......
...@@ -33,9 +33,8 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -33,9 +33,8 @@ class OpenAIGPTConfig(PretrainedConfig):
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
configuration to that of the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI. configuration to that of the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
for more information.
Args: Args:
vocab_size (:obj:`int`, `optional`, defaults to 40478): vocab_size (:obj:`int`, `optional`, defaults to 40478):
...@@ -43,8 +42,8 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -43,8 +42,8 @@ class OpenAIGPTConfig(PretrainedConfig):
:obj:`inputs_ids` passed when calling :class:`~transformers.OpenAIGPTModel` or :obj:`inputs_ids` passed when calling :class:`~transformers.OpenAIGPTModel` or
:class:`~transformers.TFOpenAIGPTModel`. :class:`~transformers.TFOpenAIGPTModel`.
n_positions (:obj:`int`, `optional`, defaults to 512): n_positions (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with. Typically set this to something large
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, `optional`, defaults to 512): n_ctx (:obj:`int`, `optional`, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions). Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, `optional`, defaults to 768): n_embd (:obj:`int`, `optional`, defaults to 768):
...@@ -54,8 +53,8 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -54,8 +53,8 @@ class OpenAIGPTConfig(PretrainedConfig):
n_head (:obj:`int`, `optional`, defaults to 12): n_head (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. The non-linear activation function (function or string) in the encoder and pooler. If string,
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
resid_pdrop (:obj:`float`, `optional`, defaults to 0.1): resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, `optional`, defaults to 0.1): embd_pdrop (:obj:`float`, `optional`, defaults to 0.1):
......
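Finally, the GPT configuration above follows the same usage pattern as the MobileBERT example earlier in this diff; a short, illustrative sketch::

    >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel

    >>> # n_positions and n_ctx usually match (maximum sequence length and causal-mask size).
    >>> configuration = OpenAIGPTConfig(n_positions=512, n_ctx=512, afn="gelu")
    >>> model = OpenAIGPTModel(configuration)
    >>> configuration = model.config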