Unverified commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -166,8 +166,9 @@ class BertGenerationEmbeddings(nn.Module):
 class BertGenerationPreTrainedModel(PreTrainedModel):
-    """An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
-    """
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
     config_class = BertGenerationConfig
@@ -193,14 +194,15 @@ BERT_GENERATION_START_DOCSTRING = r"""
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
     Parameters:
         config (:class:`~transformers.BertGenerationConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
 """
 BERT_GENERATION_INPUTS_DOCSTRING = r"""
@@ -208,27 +210,25 @@ BERT_GENERATION_INPUTS_DOCSTRING = r"""
         input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
             Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using :class:`~transformers.BertGenerationTokenizer`.
-            See :meth:`transformers.PreTrainedTokenizer.__call__` and
-            :meth:`transformers.PreTrainedTokenizer.encode` for details.
+            Indices can be obtained using :class:`~transformers.BertGenerationTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
+            details.
             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
             `What are attention masks? <../glossary.html#attention-mask>`__
         position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
             `What are position IDs? <../glossary.html#position-ids>`_
         head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
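Aside, not part of the diff: the 1/0 ``attention_mask`` documented in this hunk is normally produced by the tokenizer when padding a batch. A minimal sketch (any BERT-style checkpoint works):

```python
# Editor's illustration: padding a batch yields the attention_mask described above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(
    ["a short sentence", "a noticeably longer example sentence"],
    padding=True,
    return_tensors="pt",
)
print(batch["attention_mask"])  # 1 = real token (not masked), 0 = padding (masked)
```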
@@ -255,21 +255,19 @@ BERT_GENERATION_INPUTS_DOCSTRING = r"""
 class BertGenerationEncoder(BertGenerationPreTrainedModel):
     """
-    The model can behave as an encoder (with only self-attention) as well
-    as a decoder, in which case a layer of cross-attention is added between
-    the self-attention layers, following the architecture described in `Attention is all you need
-    <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
-    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
     This model should be used when leveraging Bert or Roberta checkpoints for the
     :class:`~transformers.EncoderDecoderModel` class as described in `Leveraging Pre-trained Checkpoints for Sequence
     Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, and Aliaksei Severyn.
-    To behave as an decoder the model needs to be initialized with the
-    :obj:`is_decoder` argument of the configuration set to :obj:`True`.
-    To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
-    argument and :obj:`add_cross_attention` set to :obj:`True`; an
-    :obj:`encoder_hidden_states` is then expected as an input to the forward pass.
+    To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
+    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
+    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+    input to the forward pass.
     """
     def __init__(self, config):
@@ -288,9 +286,9 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
         self.embeddings.word_embeddings = value
     def _prune_heads(self, heads_to_prune):
-        """Prunes heads of the model.
-        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        See base class PreTrainedModel
-        """
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
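As an aside to this hunk, the encoder/decoder usage the class docstring describes can be sketched as follows; this example is the editor's, not part of the commit, and mirrors the library's documented BertGeneration recipe:

```python
# Editor's sketch: a BERT checkpoint reused as both halves of an EncoderDecoderModel.
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel

encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# is_decoder adds the causal mask; add_cross_attention adds layers that attend
# to encoder_hidden_states, as the docstring above requires for Seq2Seq use.
decoder = BertGenerationDecoder.from_pretrained(
    "bert-large-uncased", add_cross_attention=True, is_decoder=True
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
```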
@@ -317,13 +315,12 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
     ):
         r"""
         encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
         encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for
+            tokens that are NOT MASKED, ``0`` for MASKED tokens.
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -442,20 +439,18 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel):
     ):
         r"""
         encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
         encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
-            labels in ``[0, ..., config.vocab_size]``
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
+            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
         Returns:
...
@@ -29,9 +29,9 @@ BLENDER_START_DOCSTRING = r"""
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
 """
@@ -43,8 +43,8 @@ BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/blenderbot-3B", "facebook/
 )
 class BlenderbotForConditionalGeneration(BartForConditionalGeneration):
     """
-    This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the
+    appropriate documentation alongside usage examples.
     """
     config_class = BlenderbotConfig
...
@@ -46,15 +46,15 @@ CAMEMBERT_START_DOCSTRING = r"""
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
     Parameters:
         config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
             model. Initializing with a config file does not load the weights associated with the model, only the
-            configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
 """
@@ -64,8 +64,8 @@ CAMEMBERT_START_DOCSTRING = r"""
 )
 class CamembertModel(RobertaModel):
     """
-    This class overrides :class:`~transformers.RobertaModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
+    documentation alongside usage examples.
     """
     config_class = CamembertConfig
@@ -77,64 +77,72 @@ class CamembertModel(RobertaModel):
 )
 class CamembertForMaskedLM(RobertaForMaskedLM):
     """
-    This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
+    documentation alongside usage examples.
     """
     config_class = CamembertConfig
 @add_start_docstrings(
-    """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
+    """
+    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
     CAMEMBERT_START_DOCSTRING,
 )
 class CamembertForSequenceClassification(RobertaForSequenceClassification):
     """
-    This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
+    appropriate documentation alongside usage examples.
     """
     config_class = CamembertConfig
 @add_start_docstrings(
-    """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    """
+    CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
     CAMEMBERT_START_DOCSTRING,
 )
 class CamembertForMultipleChoice(RobertaForMultipleChoice):
     """
-    This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
+    appropriate documentation alongside usage examples.
     """
     config_class = CamembertConfig
 @add_start_docstrings(
-    """CamemBERT Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    """
+    CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+    for Named-Entity-Recognition (NER) tasks.
+    """,
     CAMEMBERT_START_DOCSTRING,
 )
 class CamembertForTokenClassification(RobertaForTokenClassification):
     """
-    This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
+    appropriate documentation alongside usage examples.
     """
     config_class = CamembertConfig
 @add_start_docstrings(
-    """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD
-    (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits` """,
+    """
+    CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`
+    """,
     CAMEMBERT_START_DOCSTRING,
 )
 class CamembertForQuestionAnswering(RobertaForQuestionAnswering):
     """
-    This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
+    appropriate documentation alongside usage examples.
     """
     config_class = CamembertConfig
@@ -145,8 +153,8 @@ class CamembertForQuestionAnswering(RobertaForQuestionAnswering):
 )
 class CamembertForCausalLM(RobertaForCausalLM):
     """
-    This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
+    This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the superclass for the appropriate
+    documentation alongside usage examples.
     """
     config_class = CamembertConfig
@@ -212,8 +212,9 @@ class EncoderLayer(torch.nn.Module):
 class CTRLPreTrainedModel(PreTrainedModel):
-    """An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
-    """
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
     config_class = CTRLConfig
@@ -238,60 +239,58 @@ CTRL_START_DOCSTRING = r"""
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
     Parameters:
         config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
 """
 CTRL_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
             :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
-            ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states).
-            Indices of input sequence tokens in the vocabulary.
+            ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
-            If :obj:`past_key_values` is used, only input IDs that do not have their past calculated should be passed as
-            ``input_ids``.
+            If :obj:`past_key_values` is used, only input IDs that do not have their past calculated should be passed
+            as ``input_ids``.
-            Indices can be obtained using :class:`~transformers.CTRLTokenizer`.
-            See :meth:`transformers.PreTrainedTokenizer.__call__` and
-            :meth:`transformers.PreTrainedTokenizer.encode` for details.
+            Indices can be obtained using :class:`~transformers.CTRLTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for
+            details.
             `What are input IDs? <../glossary.html#input-ids>`__
         past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see :obj:`past_key_values` output below). Can be used to speed up sequential decoding.
-            The ``input_ids`` which have their past given to this model should not be passed as input ids as they have
-            already been computed.
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
+            have their past given to this model should not be passed as input ids as they have already been computed.
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
             `What are attention masks? <../glossary.html#attention-mask>`__
         token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:
             - 0 corresponds to a `sentence A` token,
             - 1 corresponds to a `sentence B` token.
             `What are token type IDs? <../glossary.html#token-type-ids>`_
         position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
             `What are position IDs? <../glossary.html#position-ids>`_
         head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
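For orientation on the ``past_key_values`` description above, a hedged sketch of incremental decoding with a cached past; the greedy step is illustrative and argument names follow this version's docstrings:

```python
# Editor's sketch: only tokens without a cached past are passed as input_ids.
from transformers import CTRLTokenizer, CTRLLMHeadModel

tokenizer = CTRLTokenizer.from_pretrained("ctrl")
model = CTRLLMHeadModel.from_pretrained("ctrl")
ids = tokenizer("Links Hello, my dog is", return_tensors="pt").input_ids
out = model(input_ids=ids, use_cache=True, return_dict=True)
next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
# Feed only the new token; its predecessors' key/value states are already cached.
out = model(input_ids=next_id, past_key_values=out.past_key_values, use_cache=True, return_dict=True)
```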
@@ -344,8 +343,8 @@ class CTRLModel(CTRLPreTrainedModel):
         self.w = new_embeddings
     def _prune_heads(self, heads_to_prune):
-        """Prunes heads of the model.
-        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
         for layer, heads in heads_to_prune.items():
             self.h[layer].multi_head_attention.prune_heads(heads)
@@ -498,8 +497,10 @@ class CTRLModel(CTRLPreTrainedModel):
 @add_start_docstrings(
-    """The CTRL Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
+    """
+    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
     CTRL_START_DOCSTRING,
 )
 class CTRLLMHeadModel(CTRLPreTrainedModel):
@@ -545,11 +546,9 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
+            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
         """
         if "past" in kwargs:
             warnings.warn(
...
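The shifted-labels note above means a causal LM loss needs no manual shifting; an editor's illustration (checkpoint name as published by Salesforce):

```python
# Editor's illustration: labels=input_ids yields the standard next-token loss,
# since CTRLLMHeadModel shifts the labels internally.
from transformers import CTRLTokenizer, CTRLLMHeadModel

tokenizer = CTRLTokenizer.from_pretrained("ctrl")
model = CTRLLMHeadModel.from_pretrained("ctrl")
inputs = tokenizer("Links Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"], return_dict=True)
print(outputs.loss)  # cross-entropy over positions with labels in [0, vocab_size)
```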
@@ -64,12 +64,14 @@ class ContextPooler(nn.Module):
 class XSoftmax(torch.autograd.Function):
-    """Masked Softmax which is optimized for saving memory
+    """
+    Masked Softmax which is optimized for saving memory
     Args:
         input (:obj:`torch.tensor`): The input tensor that will apply softmax.
         mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
-        dim (int): The dimenssion that will apply softmax.
+        dim (int): The dimenssion that will apply softmax
     Example::
         import torch
         from transformers.modeling_deroberta import XSoftmax
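The ``Example::`` block above is cut off by the diff context; as an editor's sketch of the implied usage (note the docstring's ``modeling_deroberta`` import path is a source typo for ``modeling_deberta``):

```python
# Editor's sketch, based on the signature documented above (input, mask, dim).
import torch
from transformers.modeling_deberta import XSoftmax  # module path as of this version

x = torch.randn(4, 20, 100)
mask = (x > 0).int()             # positions with mask == 0 are ignored by the softmax
y = XSoftmax.apply(x, mask, -1)  # autograd.Functions are invoked via .apply
```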
@@ -154,7 +156,8 @@ class XDropout(torch.autograd.Function):
 class StableDropout(torch.nn.Module):
-    """Optimized dropout module for stabilizing the training
+    """
+    Optimized dropout module for stabilizing the training
     Args:
@@ -169,7 +172,8 @@ class StableDropout(torch.nn.Module):
         self.context_stack = None
     def forward(self, x):
-        """Call the module
+        """
+        Call the module
         Args:
             x (:obj:`torch.tensor`): The input tensor to apply dropout
@@ -430,11 +434,12 @@ class DebertaEncoder(nn.Module):
 def build_relative_position(query_size, key_size, device):
-    """Build relative position according to the query and key
-    We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key :math:`P_k` is range from (0, key_size),
-    The relative positions from query to key is
-    :math:`R_{q \\rightarrow k} = P_q - P_k`
+    """
+    Build relative position according to the query and key
+    We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key
+    :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} =
+    P_q - P_k`
     Args:
         query_size (int): the length of query
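The relation :math:`R_{q \rightarrow k} = P_q - P_k` above is just an outer difference of position indices; a minimal editor's sketch (the real ``build_relative_position`` also takes ``device`` and adds a batch dimension):

```python
# Editor's sketch of the relative-position construction described above.
import torch

def relative_positions(query_size: int, key_size: int) -> torch.Tensor:
    q_ids = torch.arange(query_size)         # P_q in [0, query_size)
    k_ids = torch.arange(key_size)           # P_k in [0, key_size)
    return q_ids[:, None] - k_ids[None, :]   # rel[i, j] = i - j

print(relative_positions(3, 4))
```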
@@ -469,12 +474,13 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer):
 class DisentangledSelfAttention(torch.nn.Module):
-    """ Disentangled self-attention module
+    """
+    Disentangled self-attention module
     Parameters:
         config (:obj:`str`):
-            A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`, \
-            for more details, please refer :class:`~transformers.DebertaConfig`
+            A model config class instance with the configuration to build a new model. The schema is similar to
+            `BertConfig`, \ for more details, please refer :class:`~transformers.DebertaConfig`
     """
@@ -529,14 +535,18 @@ class DisentangledSelfAttention(torch.nn.Module):
         relative_pos=None,
         rel_embeddings=None,
     ):
-        """Call the module
+        """
+        Call the module
         Args:
             hidden_states (:obj:`torch.FloatTensor`):
-                Input states to the module usally the output from previous layer, it will be the Q,K and V in `Attention(Q,K,V)`
+                Input states to the module usally the output from previous layer, it will be the Q,K and V in
+                `Attention(Q,K,V)`
             attention_mask (:obj:`torch.ByteTensor`):
-                An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j` th token.
+                An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium
+                sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
+                th token.
             return_att (:obj:`bool`, optional):
                 Whether return the attention maxitrix.
@@ -545,10 +555,12 @@ class DisentangledSelfAttention(torch.nn.Module):
                 The `Q` state in `Attention(Q,K,V)`.
             relative_pos (:obj:`torch.LongTensor`):
-                The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with values ranging in [`-max_relative_positions`, `max_relative_positions`].
+                The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with
+                values ranging in [`-max_relative_positions`, `max_relative_positions`].
             rel_embeddings (:obj:`torch.FloatTensor`):
-                The embedding of relative distances. It's a tensor of shape [:math:`2 \\times \\text{max_relative_positions}`, `hidden_size`].
+                The embedding of relative distances. It's a tensor of shape [:math:`2 \\times
+                \\text{max_relative_positions}`, `hidden_size`].
         """
@@ -737,8 +749,9 @@ class DebertaEmbeddings(nn.Module):
 class DebertaPreTrainedModel(PreTrainedModel):
-    """An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
-    """
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
     config_class = DebertaConfig
@@ -755,21 +768,22 @@ class DebertaPreTrainedModel(PreTrainedModel):
                 module.bias.data.zero_()
-DEBERTA_START_DOCSTRING = r""" The DeBERTa model was proposed in
-    `DeBERTa: Decoding-enhanced BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`_
-    by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of BERT/RoBERTa with two improvements, i.e.
-    disentangled attention and enhanced mask decoder. With those two improvements, it out perform BERT/RoBERTa on a majority
-    of tasks with 80GB pre-trianing data.
+DEBERTA_START_DOCSTRING = r"""
+    The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
+    <https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+    BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-trianing data.
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.```
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.```
     Parameters:
         config (:class:`~transformers.DebertaConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
 """
 DEBERTA_INPUTS_DOCSTRING = r"""
@@ -777,26 +791,24 @@ DEBERTA_INPUTS_DOCSTRING = r"""
         input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
             Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using :class:`transformers.DebertaTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.__call__` for details.
+            Indices can be obtained using :class:`transformers.DebertaTokenizer`. See
+            :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+            details.
             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for
+            tokens that are NOT MASKED, ``0`` for MASKED tokens.
             `What are attention masks? <../glossary.html#attention-mask>`__
         token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token
            `What are token type IDs? <../glossary.html#token-type-ids>`_
         position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
             `What are position IDs? <../glossary.html#position-ids>`_
         inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
@@ -804,9 +816,11 @@ DEBERTA_INPUTS_DOCSTRING = r"""
             This is useful if you want more control over how to convert `input_ids` indices into associated vectors
             than the model's internal embedding lookup matrix.
         output_attentions (:obj:`bool`, `optional`):
-            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under
+            returned tensors for more detail.
         output_hidden_states (:obj:`bool`, `optional`):
-            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned
+            tensors for more detail.
         return_dict (:obj:`bool`, `optional`):
             If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
             plain tuple.
@@ -834,9 +848,9 @@ class DebertaModel(DebertaPreTrainedModel):
         self.embeddings.word_embeddings = new_embeddings
     def _prune_heads(self, heads_to_prune):
-        """Prunes heads of the model.
-        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        See base class PreTrainedModel
-        """
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
         raise NotImplementedError("The prune function is not implemented in DeBERTa model.")
@@ -928,8 +942,10 @@ class DebertaModel(DebertaPreTrainedModel):
 @add_start_docstrings(
-    """DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
+    """
+    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
     DEBERTA_START_DOCSTRING,
 )
 class DebertaForSequenceClassification(DebertaPreTrainedModel):
@@ -977,9 +993,8 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
             If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...
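A quick editor's note on the ``num_labels`` switch documented in this hunk (checkpoint name is real but the usage is illustrative):

```python
# Editor's illustration: num_labels == 1 -> regression head (MSE loss);
# num_labels > 1 -> classification head (cross-entropy).
import torch
from transformers import DebertaForSequenceClassification

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=3)
labels = torch.tensor([2])  # a class index in [0, num_labels - 1]
```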
@@ -12,9 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch DistilBERT model
-adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
-and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
+"""
+PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in
+part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
 """
@@ -95,15 +95,11 @@ class Embeddings(nn.Module):
     def forward(self, input_ids):
         """
-        Parameters
-        ----------
-        input_ids: torch.tensor(bs, max_seq_length)
-            The token ids to embed.
-        Outputs
-        -------
-        embeddings: torch.tensor(bs, max_seq_length, dim)
-            The embedded tokens (plus position embeddings, no token_type embeddings)
+        Parameters:
+            input_ids: torch.tensor(bs, max_seq_length) The token ids to embed.
+        Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type
+        embeddings)
         """
         seq_length = input_ids.size(1)
         position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
@@ -152,19 +148,15 @@ class MultiHeadSelfAttention(nn.Module):
     def forward(self, query, key, value, mask, head_mask=None, output_attentions=False):
         """
-        Parameters
-        ----------
-        query: torch.tensor(bs, seq_length, dim)
-        key: torch.tensor(bs, seq_length, dim)
-        value: torch.tensor(bs, seq_length, dim)
-        mask: torch.tensor(bs, seq_length)
-        Outputs
-        -------
-        weights: torch.tensor(bs, n_heads, seq_length, seq_length)
-            Attention weights
-        context: torch.tensor(bs, seq_length, dim)
-            Contextualized layer. Optional: only if `output_attentions=True`
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
+            seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
         """
         bs, q_length, dim = query.size()
         k_length = key.size(1)
@@ -247,17 +239,13 @@ class TransformerBlock(nn.Module):
     def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False):
         """
-        Parameters
-        ----------
-        x: torch.tensor(bs, seq_length, dim)
-        attn_mask: torch.tensor(bs, seq_length)
-        Outputs
-        -------
-        sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
-            The attention weights
-        ffn_output: torch.tensor(bs, seq_length, dim)
-            The output of the transformer block contextualization.
+        Parameters:
+            x: torch.tensor(bs, seq_length, dim)
+            attn_mask: torch.tensor(bs, seq_length)
+        Returns:
+            sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output:
+            torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization.
         """
         # Self-Attention
         sa_output = self.attention(
@@ -295,25 +283,20 @@ class Transformer(nn.Module):
     def forward(
         self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None
-    ):
+    ):  # docstyle-ignore
         """
-        Parameters
-        ----------
-        x: torch.tensor(bs, seq_length, dim)
-            Input sequence embedded.
-        attn_mask: torch.tensor(bs, seq_length)
-            Attention mask on the sequence.
-        Outputs
-        -------
-        hidden_state: torch.tensor(bs, seq_length, dim)
-            Sequence of hiddens states in the last (top) layer
-        all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
-            Tuple of length n_layers with the hidden states from each layer.
-            Optional: only if output_hidden_states=True
-        all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
-            Tuple of length n_layers with the attention weights from each layer
-            Optional: only if output_attentions=True
+        Parameters:
+            x: torch.tensor(bs, seq_length, dim) Input sequence embedded.
+            attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
+        Returns:
+            hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top)
+            layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
+                Tuple of length n_layers with the hidden states from each layer.
+                Optional: only if output_hidden_states=True
+            all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
+                Tuple of length n_layers with the attention weights from each layer
+                Optional: only if output_attentions=True
         """
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
@@ -348,8 +331,9 @@ class Transformer(nn.Module):
 # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
 class DistilBertPreTrainedModel(PreTrainedModel):
-    """An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
-    """
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
     config_class = DistilBertConfig
@@ -376,14 +360,15 @@ DISTILBERT_START_DOCSTRING = r"""
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
     Parameters:
         config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -391,22 +376,20 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
         input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
             Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using :class:`~transformers.DistilBertTokenizer`.
-            See :meth:`transformers.PreTrainedTokenizer.encode` and
-            :meth:`transformers.PreTrainedTokenizer.__call__` for details.
+            Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
             `What are attention masks? <../glossary.html#attention-mask>`__
         head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
@@ -446,9 +429,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
        self.embeddings.word_embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
        base class :class:`~transformers.PreTrainedModel`.
        """
        for layer, heads in heads_to_prune.items():
            self.transformer.layer[layer].attention.prune_heads(heads)
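A minimal sketch of the public entry point that dispatches to this method; the layer and head indices below are arbitrary::

    from transformers import DistilBertModel

    model = DistilBertModel.from_pretrained("distilbert-base-uncased")
    # Remove heads 0 and 2 of layer 0, and head 1 of layer 2.
    model.prune_heads({0: [0, 2], 2: [1]})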
@@ -547,10 +530,9 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
@@ -595,8 +577,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
@add_start_docstrings(
    """
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
@@ -631,9 +615,8 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            if :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
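For illustration, a sketch of single-label classification; the class index and the ``num_labels`` value are assumptions, not something the checkpoint ships with::

    import torch
    from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.tensor([1])  # one class index per example, in [0, num_labels - 1]
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_labels)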
@@ -676,8 +659,10 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
@add_start_docstrings(
    """
    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
@@ -713,12 +698,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
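A sketch with made-up span indices; in practice they come from the annotated answer span::

    import torch
    from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    inputs = tokenizer("Who was Jim Henson?", "Jim Henson was a nice puppet", return_tensors="pt")
    start_positions = torch.tensor([8])   # illustrative token indices, not real annotations
    end_positions = torch.tensor([13])

    outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions, return_dict=True)
    print(outputs.loss, outputs.start_logits.shape, outputs.end_logits.shape)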
@@ -770,8 +755,10 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
@add_start_docstrings(
    """
    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
@@ -805,8 +792,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
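A sketch of per-token labels; the all-zero labels and the ``num_labels`` value are purely illustrative::

    import torch
    from transformers import DistilBertForTokenClassification, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

    inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
    labels = torch.zeros_like(inputs["input_ids"])  # one label per token, in [0, num_labels - 1]
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, sequence_length, num_labels)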
@@ -852,8 +839,10 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
@add_start_docstrings(
    """
    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
@@ -882,9 +871,9 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)

        Returns:
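A sketch of the :obj:`(batch_size, num_choices, sequence_length)` shaping described above: each choice is encoded against the same prompt, then a batch dimension is added (the example sentences are made up)::

    import torch
    from transformers import DistilBertForMultipleChoice, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-uncased")

    prompt = "In Italy, pizza is served in formal settings."
    choices = ["It is eaten with a fork and a knife.", "It is eaten while held in the hand."]

    encoding = tokenizer([prompt, prompt], choices, padding=True, return_tensors="pt")
    inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # add the batch dimension
    labels = torch.tensor([0])  # index of the correct choice

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_choices)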
@@ -59,18 +59,17 @@ class DPRContextEncoderOutput(ModelOutput):
    Args:
        pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``):
            The DPR encoder outputs the `pooler_output` that corresponds to the context representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
            This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
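A minimal sketch of producing such a context embedding, assuming the public ``facebook/dpr-ctx_encoder-single-nq-base`` checkpoint::

    from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

    tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

    inputs = tokenizer("Hello, is my dog cute?", return_tensors="pt")
    # pooler_output is the embedding used for nearest-neighbour search against questions.
    embedding = model(**inputs, return_dict=True).pooler_output
    print(embedding.shape)  # (batch_size, embeddings_size)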
@@ -88,18 +87,17 @@ class DPRQuestionEncoderOutput(ModelOutput):
    Args:
        pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``):
            The DPR encoder outputs the `pooler_output` that corresponds to the question representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
            This output is to be used to embed questions for nearest neighbors queries with context embeddings.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
@@ -121,16 +119,16 @@ class DPRReaderOutput(ModelOutput):
        end_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``):
            Logits of the end index of the span for each passage.
        relevance_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, )``):
            Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
            question, compared to all the other passages.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
@@ -268,8 +266,9 @@ class DPRSpanPredictor(PreTrainedModel):
class DPRPretrainedContextEncoder(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
@@ -282,8 +281,9 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
class DPRPretrainedQuestionEncoder(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
@@ -296,8 +296,9 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
class DPRPretrainedReader(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
@@ -322,88 +323,83 @@ DPR_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.DPRConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
"""
DPR_ENCODERS_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
            formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

            ::

                tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
                token_type_ids:   0   0   0    0    0     0      0   0   1  1  1  1  1   1

            (b) For single sequences (for a question for example):

            ::

                tokens:         [CLS] the dog is hairy . [SEP]
                token_type_ids:   0   0   0   0    0   0   0

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.
            Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``:

            - 0 corresponds to a `sentence A` token,
            - 1 corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""
DPR_READER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids: (:obj:`Tuple[torch.LongTensor]` of shapes :obj:`(n_passages, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question,
            2) the passages titles and 3) the passages texts. To match pretraining, DPR :obj:`input_ids` sequence
            should be formatted with [CLS] and [SEP] with the format:

            ``[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>``

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for
            more details.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
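A sketch of how the reader tokenizer assembles the question/title/text triplets and how the three logit tensors come back, assuming the public ``facebook/dpr-reader-single-nq-base`` checkpoint::

    from transformers import DPRReader, DPRReaderTokenizer

    tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
    model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")

    # The tokenizer builds the [CLS] <question> [SEP] <title> [SEP] <text> sequences.
    encoded = tokenizer(
        questions=["What is love?"],
        titles=["Haddaway"],
        texts=["'What Is Love' is a song recorded by the artist Haddaway."],
        padding=True,
        return_tensors="pt",
    )
    outputs = model(**encoded, return_dict=True)
    print(outputs.start_logits.shape, outputs.end_logits.shape, outputs.relevance_logits.shape)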
@@ -527,8 +527,9 @@ class ElectraGeneratorPredictions(nn.Module):
class ElectraPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ElectraConfig
@@ -567,8 +568,8 @@ class ElectraForPreTrainingOutput(ModelOutput):
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
@@ -586,14 +587,15 @@ ELECTRA_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
"""

ELECTRA_INPUTS_DOCSTRING = r"""
@@ -601,35 +603,33 @@ ELECTRA_INPUTS_DOCSTRING = r"""
        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`~transformers.ElectraTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
            1]``:

            - 0 corresponds to a `sentence A` token,
            - 1 corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
            config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
@@ -639,12 +639,11 @@ ELECTRA_INPUTS_DOCSTRING = r"""
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
@@ -687,9 +686,9 @@ class ElectraModel(ElectraPreTrainedModel):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
        base class :class:`~transformers.PreTrainedModel`.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
@@ -777,8 +776,10 @@ class ElectraClassificationHead(nn.Module):
@add_start_docstrings(
    """
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForSequenceClassification(ElectraPreTrainedModel):
@@ -812,9 +813,8 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            if :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -861,7 +861,8 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
    Electra model with a binary classification head on top as used during pre-training for identifying generated
    tokens.

    It is recommended to load the discriminator checkpoint into that model.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForPreTraining(ElectraPreTrainedModel):
@@ -889,8 +890,8 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
    ):
        r"""
        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids`
            docstring). Indices should be in ``[0, 1]``:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.
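A sketch of the replaced-token-detection loss with an all-zero (i.e. "nothing was replaced") label tensor, assuming the public ``google/electra-small-discriminator`` checkpoint::

    import torch
    from transformers import ElectraForPreTraining, ElectraTokenizer

    tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
    model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

    inputs = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors="pt")
    labels = torch.zeros_like(inputs["input_ids"])  # 0 = original token, 1 = replaced token
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)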
@@ -952,8 +953,9 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
    """
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForMaskedLM(ElectraPreTrainedModel):
@@ -992,10 +994,9 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
@@ -1046,7 +1047,8 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
    """
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForTokenClassification(ElectraPreTrainedModel):
@@ -1080,8 +1082,8 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1128,7 +1130,8 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
@add_start_docstrings(
    """
    ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForQuestionAnswering(ElectraPreTrainedModel):
@@ -1168,12 +1171,12 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1229,8 +1232,10 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
@add_start_docstrings(
    """
    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForMultipleChoice(ElectraPreTrainedModel):
@@ -1265,9 +1270,9 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -33,9 +33,8 @@ ENCODER_DECODER_START_DOCSTRING = r"""
    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
    encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via the
    :meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via the
    :meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
    to the decoder and should be fine-tuned on a downstream generative task, like summarization.

    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
    tasks was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
@@ -49,14 +48,15 @@ ENCODER_DECODER_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.EncoderDecoderConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
"""

ENCODER_DECODER_INPUTS_DOCSTRING = r"""
@@ -64,32 +64,30 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r"""
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Provide for sequence to sequence training to the decoder. Indices can be obtained using
            :class:`~transformers.PreTrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and
            :meth:`transformers.PreTrainedTokenizer.__call__` for details.
        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        encoder_outputs (:obj:`tuple(torch.FloatTensor)`, `optional`):
            This tuple must consist of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
            :obj:`attentions`). :obj:`last_hidden_state` (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
            sequence_length, hidden_size)`) is a tensor of hidden-states at the output of the last layer of the
            encoder. Used in the cross-attention of the decoder.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
@@ -105,10 +103,9 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r"""
            representation. This is useful if you want more control over how to convert :obj:`decoder_input_ids`
            indices into associated vectors than the model's internal embedding lookup matrix.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0,
            ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
@@ -122,6 +119,7 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r"""
            If set to ``True``, the model will return a :class:`~transformers.file_utils.Seq2SeqLMOutput` instead of a
            plain tuple.
        kwargs: (`optional`) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:

            - Without a prefix which will be input as ``**encoder_kwargs`` for the encoder forward function.
            - With a `decoder_` prefix which will be input as ``**decoder_kwargs`` for the decoder forward function.
"""
...@@ -130,10 +128,9 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r""" ...@@ -130,10 +128,9 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r"""
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING) @add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
class EncoderDecoderModel(PreTrainedModel): class EncoderDecoderModel(PreTrainedModel):
r""" r"""
:class:`~transformers.EncoderDecoder` is a generic model class that will be :class:`~transformers.EncoderDecoder` is a generic model class that will be instantiated as a transformer
instantiated as a transformer architecture with one of the base model architecture with one of the base model classes of the library as encoder and another one as decoder when created
classes of the library as encoder and another one as with the :meth:`~transformers.AutoModel.from_pretrained` class method for the encoder and
decoder when created with the :meth:`~transformers.AutoModel.from_pretrained` class method for the encoder and :meth:`~transformers.AutoModelForCausalLM.from_pretrained` class method for the decoder.
:meth:`~transformers.AutoModelForCausalLM.from_pretrained` class method for the decoder.
""" """
config_class = EncoderDecoderConfig config_class = EncoderDecoderConfig
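As a quick orientation for the class described above, one way to assemble such a model is the combined helper (checkpoint names are illustrative)::

    from transformers import EncoderDecoderModel

    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased",  # encoder, loaded as described above
        "bert-base-uncased",  # decoder, loaded with a causal LM head
    )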
...@@ -210,8 +207,8 @@ class EncoderDecoderModel(PreTrainedModel): ...@@ -210,8 +207,8 @@ class EncoderDecoderModel(PreTrainedModel):
checkpoints. checkpoints.
The model is set in evaluation mode by default using :obj:`model.eval()` (Dropout modules are deactivated). The model is set in evaluation mode by default using :obj:`model.eval()` (Dropout modules are deactivated). To
To train the model, you need to first set it back in training mode with :obj:`model.train()`. train the model, you need to first set it back in training mode with :obj:`model.train()`.
Params: Params:
encoder_pretrained_model_name_or_path (:obj: `str`, `optional`): encoder_pretrained_model_name_or_path (:obj: `str`, `optional`):
......
...@@ -56,14 +56,15 @@ FLAUBERT_START_DOCSTRING = r""" ...@@ -56,14 +56,15 @@ FLAUBERT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
FLAUBERT_INPUTS_DOCSTRING = r""" FLAUBERT_INPUTS_DOCSTRING = r"""
...@@ -71,44 +72,42 @@ FLAUBERT_INPUTS_DOCSTRING = r""" ...@@ -71,44 +72,42 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.FlaubertTokenizer`. Indices can be obtained using :class:`~transformers.FlaubertTokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Indices of positions of each input sequence token in the position embeddings. Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
Selected in the range ``[0, config.max_position_embeddings - 1]``. config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. Length of each sentence that can be used to avoid performing attention on padding token indices. You can
You can also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
Indices selected in ``[0, ..., input_ids.size(-1)]``: selected in ``[0, ..., input_ids.size(-1)]``:
cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`): cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
Dictionary of strings to ``torch.FloatTensor`` that contains precomputed Dictionary of strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the
hidden-states (key and values in the attention blocks) as computed by the model attention blocks) as computed by the model (see :obj:`cache` output below). Can be used to speed up
(see :obj:`cache` output below). Can be used to speed up sequential decoding. sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly
The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. computed hidden-states.
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
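In practice the masks documented above usually come straight from the tokenizer; a short sketch (the checkpoint name is illustrative)::

    from transformers import FlaubertTokenizer

    tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
    encoded = tokenizer(["Le chat dort.", "Bonjour !"], padding=True, return_tensors="pt")
    # attention_mask is 1 for real tokens and 0 for padding, matching the convention above
    print(encoded["input_ids"].shape, encoded["attention_mask"])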
...@@ -308,14 +307,16 @@ class FlaubertModel(XLMModel): ...@@ -308,14 +307,16 @@ class FlaubertModel(XLMModel):
@add_start_docstrings( @add_start_docstrings(
"""The Flaubert Model transformer with a language modeling head on top """
(linear layer with weights tied to the input embeddings). """, The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
FLAUBERT_START_DOCSTRING, FLAUBERT_START_DOCSTRING,
) )
class FlaubertWithLMHeadModel(XLMWithLMHeadModel): class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
""" """
This class overrides :class:`~transformers.XLMWithLMHeadModel`. Please check the This class overrides :class:`~transformers.XLMWithLMHeadModel`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = FlaubertConfig config_class = FlaubertConfig
...@@ -327,14 +328,16 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel): ...@@ -327,14 +328,16 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
@add_start_docstrings( @add_start_docstrings(
"""Flaubert Model with a sequence classification/regression head on top (a linear layer on top of """
the pooled output) e.g. for GLUE tasks. """, Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
e.g. for GLUE tasks.
""",
FLAUBERT_START_DOCSTRING, FLAUBERT_START_DOCSTRING,
) )
class FlaubertForSequenceClassification(XLMForSequenceClassification): class FlaubertForSequenceClassification(XLMForSequenceClassification):
""" """
This class overrides :class:`~transformers.XLMForSequenceClassification`. Please check the This class overrides :class:`~transformers.XLMForSequenceClassification`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = FlaubertConfig config_class = FlaubertConfig
...@@ -346,14 +349,16 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification): ...@@ -346,14 +349,16 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification):
@add_start_docstrings( @add_start_docstrings(
"""Flaubert Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
FLAUBERT_START_DOCSTRING, FLAUBERT_START_DOCSTRING,
) )
class FlaubertForTokenClassification(XLMForTokenClassification): class FlaubertForTokenClassification(XLMForTokenClassification):
""" """
This class overrides :class:`~transformers.XLMForTokenClassification`. Please check the This class overrides :class:`~transformers.XLMForTokenClassification`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = FlaubertConfig config_class = FlaubertConfig
...@@ -365,14 +370,16 @@ class FlaubertForTokenClassification(XLMForTokenClassification): ...@@ -365,14 +370,16 @@ class FlaubertForTokenClassification(XLMForTokenClassification):
@add_start_docstrings( @add_start_docstrings(
"""Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of """
the hidden-states output to compute `span start logits` and `span end logits`). """, Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING, FLAUBERT_START_DOCSTRING,
) )
class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
""" """
This class overrides :class:`~transformers.XLMForQuestionAnsweringSimple`. Please check the This class overrides :class:`~transformers.XLMForQuestionAnsweringSimple`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = FlaubertConfig config_class = FlaubertConfig
...@@ -384,14 +391,16 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): ...@@ -384,14 +391,16 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
@add_start_docstrings( @add_start_docstrings(
"""Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like """
SQuAD (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like
SQuAD (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING, FLAUBERT_START_DOCSTRING,
) )
class FlaubertForQuestionAnswering(XLMForQuestionAnswering): class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
""" """
This class overrides :class:`~transformers.XLMForQuestionAnswering`. Please check the This class overrides :class:`~transformers.XLMForQuestionAnswering`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = FlaubertConfig config_class = FlaubertConfig
...@@ -403,14 +412,16 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering): ...@@ -403,14 +412,16 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
@add_start_docstrings( @add_start_docstrings(
"""Flaubert Model with a multiple choice classification head on top (a linear layer on top of """
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
FLAUBERT_START_DOCSTRING, FLAUBERT_START_DOCSTRING,
) )
class FlaubertForMultipleChoice(XLMForMultipleChoice): class FlaubertForMultipleChoice(XLMForMultipleChoice):
""" """
This class overrides :class:`~transformers.XLMForMultipleChoice`. Please check the This class overrides :class:`~transformers.XLMForMultipleChoice`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = FlaubertConfig config_class = FlaubertConfig
......
...@@ -46,10 +46,9 @@ MODEL_MAPPING = OrderedDict( ...@@ -46,10 +46,9 @@ MODEL_MAPPING = OrderedDict(
class FlaxAutoModel(object): class FlaxAutoModel(object):
r""" r"""
:class:`~transformers.FlaxAutoModel` is a generic model class :class:`~transformers.FlaxAutoModel` is a generic model class that will be instantiated as one of the base model
that will be instantiated as one of the base model classes of the library classes of the library when created with the `FlaxAutoModel.from_pretrained(pretrained_model_name_or_path)` or the
when created with the `FlaxAutoModel.from_pretrained(pretrained_model_name_or_path)` `FlaxAutoModel.from_config(config)` class methods.
or the `FlaxAutoModel.from_config(config)` class methods.
This class cannot be instantiated using `__init__()` (throws an error). This class cannot be instantiated using `__init__()` (throws an error).
""" """
...@@ -63,19 +62,22 @@ class FlaxAutoModel(object): ...@@ -63,19 +62,22 @@ class FlaxAutoModel(object):
@classmethod @classmethod
def from_config(cls, config): def from_config(cls, config):
r"""Instantiates one of the base model classes of the library r"""
from a configuration. Instantiates one of the base model classes of the library from a configuration.
Args: Args:
config (:class:`~transformers.PretrainedConfig`): config (:class:`~transformers.PretrainedConfig`):
The model class to instantiate is selected based on the configuration class: The model class to instantiate is selected based on the configuration class:
- isinstance of `roberta` configuration class: :class:`~transformers.FlaxRobertaModel` (RoBERTa model) - isinstance of `roberta` configuration class: :class:`~transformers.FlaxRobertaModel` (RoBERTa model)
- isinstance of `bert` configuration class: :class:`~transformers.FlaxBertModel` (Bert model) - isinstance of `bert` configuration class: :class:`~transformers.FlaxBertModel` (Bert model)
Examples:
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. Examples::
model = FlaxAutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
# Download configuration from S3 and cache.
config = BertConfig.from_pretrained('bert-base-uncased')
# Instantiate the model from the configuration alone; no pretrained weights are loaded.
model = FlaxAutoModel.from_config(config)
""" """
for config_class, model_class in MODEL_MAPPING.items(): for config_class, model_class in MODEL_MAPPING.items():
if isinstance(config, config_class): if isinstance(config, config_class):
...@@ -88,60 +90,75 @@ class FlaxAutoModel(object): ...@@ -88,60 +90,75 @@ class FlaxAutoModel(object):
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""Instantiates one of the base model classes of the library r"""
from a pre-trained model configuration. Instantiates one of the base model classes of the library from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance based on the
`model_type` property of the config object, or when it's missing, falling back to using pattern matching on the
`pretrained_model_name_or_path` string.
The `from_pretrained()` method takes care of returning the correct model class instance The base model class to instantiate is selected as the first pattern matching in the
based on the `model_type` property of the config object, or when it's missing, `pretrained_model_name_or_path` string (in the following order):
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `roberta`: :class:`~transformers.FlaxRobertaModel` (RoBERTa model) - contains `roberta`: :class:`~transformers.FlaxRobertaModel` (RoBERTa model)
- contains `bert`: :class:`~transformers.FlaxBertModel` (Bert model) - contains `bert`: :class:`~transformers.FlaxBertModel` (Bert model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
To train the model, you should first set it back in training mode with `model.train()`. train the model, you should first set it back in training mode with `model.train()`.
Args: Args:
pretrained_model_name_or_path: either: pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.:
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.:
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using
:func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this
case, ``from_tf`` should be set to True and a configuration object should be provided as ``config``
argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model
using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments: model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
- the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. pretrained model), or
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded
by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict: state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from a saved weights file. an optional state dictionary for the model to use instead of a state dictionary loaded from a saved
This option can be used if you want to create a model from a pretrained configuration but load your own weights. weights file. This option can be used if you want to create a model from a pretrained configuration but
In this case though, you should check if using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and :func:`~transformers.FlaxPreTrainedModel.from_pretrained` is not a simpler option. load your own weights. In this case though, you should check if using
:func:`~transformers.FlaxPreTrainedModel.save_pretrained` and
:func:`~transformers.FlaxPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string: cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model Path to a directory in which a downloaded pre-trained model configuration should be cached if the
configuration should be cached if the standard cache should not be used. standard cache should not be used.
force_download: (`optional`) boolean, default False: force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist. Force to (re-)download the model weights and configuration files and override the cached versions if
they exist.
resume_download: (`optional`) boolean, default False: resume_download: (`optional`) boolean, default False:
Do not delete incompletely received files. Attempt to resume the download if such a file exists. Do not delete incompletely received files. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None: proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
output_loading_info: (`optional`) boolean: output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error
messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments: kwargs: (`optional`) Remaining dictionary of keyword arguments:
These arguments will be passed to the configuration and the model. These arguments will be passed to the configuration and the model.
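Putting the arguments above together, a hedged usage sketch (the cache path is illustrative)::

    from transformers import FlaxAutoModel

    # 'bert' in the name selects FlaxBertModel via the pattern matching described above
    model = FlaxAutoModel.from_pretrained(
        "bert-base-uncased",
        cache_dir="/tmp/flax-cache",
        force_download=False,
    )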
......
...@@ -40,14 +40,15 @@ BERT_START_DOCSTRING = r""" ...@@ -40,14 +40,15 @@ BERT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
BERT_INPUTS_DOCSTRING = r""" BERT_INPUTS_DOCSTRING = r"""
...@@ -55,35 +56,33 @@ BERT_INPUTS_DOCSTRING = r""" ...@@ -55,35 +56,33 @@ BERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. Indices can be obtained using :class:`~transformers.BertTokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence token in the position embeddings. Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
Selected in the range ``[0, config.max_position_embeddings - 1]``. config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
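For instance, the :obj:`head_mask` convention above can be exercised as follows (layer and head counts are illustrative)::

    import torch

    num_layers, num_heads = 12, 12
    head_mask = torch.ones(num_layers, num_heads)
    head_mask[0, :4] = 0.0  # nullify the first four heads of the first layer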
...@@ -104,8 +103,8 @@ BERT_INPUTS_DOCSTRING = r""" ...@@ -104,8 +103,8 @@ BERT_INPUTS_DOCSTRING = r"""
class FlaxBertLayerNorm(nn.Module): class FlaxBertLayerNorm(nn.Module):
"""Layer normalization (https://arxiv.org/abs/1607.06450). """
Operates on the last axis of the input data. Layer normalization (https://arxiv.org/abs/1607.06450). Operates on the last axis of the input data.
""" """
epsilon: float = 1e-6 epsilon: float = 1e-6
...@@ -117,21 +116,21 @@ class FlaxBertLayerNorm(nn.Module): ...@@ -117,21 +116,21 @@ class FlaxBertLayerNorm(nn.Module):
@compact @compact
def __call__(self, x): def __call__(self, x):
"""Applies layer normalization on the input. """
It normalizes the activations of the layer for each given example in a Applies layer normalization on the input. It normalizes the activations of the layer for each given example in
batch independently, rather than across a batch like Batch Normalization. a batch independently, rather than across a batch like Batch Normalization. i.e. applies a transformation that
i.e. applies a transformation that maintains the mean activation within maintains the mean activation within each example close to 0 and the activation standard deviation close to 1.
each example close to 0 and the activation standard deviation close to 1.
Args: Args:
x: the inputs x: the inputs
epsilon: A small float added to variance to avoid dividing by zero. epsilon: A small float added to variance to avoid dividing by zero.
dtype: the dtype of the computation (default: float32). dtype: the dtype of the computation (default: float32).
bias: If True, bias (beta) is added. bias: If True, bias (beta) is added.
scale: If True, multiply by scale (gamma). When the next layer is linear scale: If True, multiply by scale (gamma). When the next layer is linear
(also e.g. nn.relu), this can be disabled since the scaling will be done (also e.g. nn.relu), this can be disabled since the scaling will be done by the next layer.
by the next layer.
bias_init: Initializer for bias, by default, zero. bias_init: Initializer for bias, by default, zero.
scale_init: Initializer for scale, by default, one. scale_init: Initializer for scale, by default, one.
Returns: Returns:
Normalized inputs (the same shape as inputs). Normalized inputs (the same shape as inputs).
""" """
...@@ -150,9 +149,8 @@ class FlaxBertLayerNorm(nn.Module): ...@@ -150,9 +149,8 @@ class FlaxBertLayerNorm(nn.Module):
class FlaxBertEmbedding(nn.Module): class FlaxBertEmbedding(nn.Module):
""" """
Specify a new class for doing the embedding stuff Specify a new class for doing the embedding stuff as Flax's one uses 'embedding' for the parameter name and PyTorch
as Flax's one uses 'embedding' for the parameter name uses 'weight'.
and PyTorch uses 'weight'.
""" """
vocab_size: int vocab_size: int
...@@ -321,11 +319,10 @@ class FlaxBertModule(nn.Module): ...@@ -321,11 +319,10 @@ class FlaxBertModule(nn.Module):
) )
class FlaxBertModel(FlaxPreTrainedModel): class FlaxBertModel(FlaxPreTrainedModel):
""" """
The model can behave as an encoder (with only self-attention) as well The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
as a decoder, in which case a layer of cross-attention is added between cross-attention is added between the self-attention layers, following the architecture described in `Attention is
the self-attention layers, following the architecture described in `Attention is all you need all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
<https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
""" """
model_class = FlaxBertModule model_class = FlaxBertModule
......
...@@ -39,14 +39,15 @@ ROBERTA_START_DOCSTRING = r""" ...@@ -39,14 +39,15 @@ ROBERTA_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration. model. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
ROBERTA_INPUTS_DOCSTRING = r""" ROBERTA_INPUTS_DOCSTRING = r"""
...@@ -54,35 +55,33 @@ ROBERTA_INPUTS_DOCSTRING = r""" ...@@ -54,35 +55,33 @@ ROBERTA_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.RobertaTokenizer`. Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence token in the position embeddings. Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
Selected in the range ``[0, config.max_position_embeddings - 1]``. config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
...@@ -104,8 +103,8 @@ ROBERTA_INPUTS_DOCSTRING = r""" ...@@ -104,8 +103,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
# Copied from transformers.modeling_flax_bert.FlaxBertLayerNorm with Bert->Roberta # Copied from transformers.modeling_flax_bert.FlaxBertLayerNorm with Bert->Roberta
class FlaxRobertaLayerNorm(nn.Module): class FlaxRobertaLayerNorm(nn.Module):
"""Layer normalization (https://arxiv.org/abs/1607.06450). """
Operates on the last axis of the input data. Layer normalization (https://arxiv.org/abs/1607.06450). Operates on the last axis of the input data.
""" """
epsilon: float = 1e-6 epsilon: float = 1e-6
...@@ -117,21 +116,21 @@ class FlaxRobertaLayerNorm(nn.Module): ...@@ -117,21 +116,21 @@ class FlaxRobertaLayerNorm(nn.Module):
@compact @compact
def __call__(self, x): def __call__(self, x):
"""Applies layer normalization on the input. """
It normalizes the activations of the layer for each given example in a Applies layer normalization on the input. It normalizes the activations of the layer for each given example in
batch independently, rather than across a batch like Batch Normalization. a batch independently, rather than across a batch like Batch Normalization. i.e. applies a transformation that
i.e. applies a transformation that maintains the mean activation within maintains the mean activation within each example close to 0 and the activation standard deviation close to 1.
each example close to 0 and the activation standard deviation close to 1.
Args: Args:
x: the inputs x: the inputs
epsilon: A small float added to variance to avoid dividing by zero. epsilon: A small float added to variance to avoid dividing by zero.
dtype: the dtype of the computation (default: float32). dtype: the dtype of the computation (default: float32).
bias: If True, bias (beta) is added. bias: If True, bias (beta) is added.
scale: If True, multiply by scale (gamma). When the next layer is linear scale: If True, multiply by scale (gamma). When the next layer is linear
(also e.g. nn.relu), this can be disabled since the scaling will be done (also e.g. nn.relu), this can be disabled since the scaling will be done by the next layer.
by the next layer.
bias_init: Initializer for bias, by default, zero. bias_init: Initializer for bias, by default, zero.
scale_init: Initializer for scale, by default, one. scale_init: Initializer for scale, by default, one.
Returns: Returns:
Normalized inputs (the same shape as inputs). Normalized inputs (the same shape as inputs).
""" """
...@@ -151,9 +150,8 @@ class FlaxRobertaLayerNorm(nn.Module): ...@@ -151,9 +150,8 @@ class FlaxRobertaLayerNorm(nn.Module):
# Copied from transformers.modeling_flax_bert.FlaxBertEmbedding with Bert->Roberta # Copied from transformers.modeling_flax_bert.FlaxBertEmbedding with Bert->Roberta
class FlaxRobertaEmbedding(nn.Module): class FlaxRobertaEmbedding(nn.Module):
""" """
Specify a new class for doing the embedding stuff Specify a new class for doing the embedding stuff as Flax's one uses 'embedding' for the parameter name and PyTorch
as Flax's one uses 'embedding' for the parameter name uses 'weight'.
and PyTorch uses 'weight'.
""" """
vocab_size: int vocab_size: int
...@@ -332,10 +330,10 @@ class FlaxRobertaModule(nn.Module): ...@@ -332,10 +330,10 @@ class FlaxRobertaModule(nn.Module):
) )
class FlaxRobertaModel(FlaxPreTrainedModel): class FlaxRobertaModel(FlaxPreTrainedModel):
""" """
The model can behave as an encoder (with only self-attention) as well The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
as a decoder, in which case a layer of cross-attention is added between cross-attention is added between the self-attention layers, following the architecture described in `Attention is
the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. Kaiser and Illia Polosukhin.
""" """
model_class = FlaxRobertaModule model_class = FlaxRobertaModule
......
...@@ -35,7 +35,8 @@ logger = logging.get_logger(__name__) ...@@ -35,7 +35,8 @@ logger = logging.get_logger(__name__)
@jax.jit @jax.jit
def gelu(x): def gelu(x):
r"""Gaussian error linear unit activation function. r"""
Gaussian error linear unit activation function.
Computes the element-wise function: Computes the element-wise function:
...@@ -43,9 +44,8 @@ def gelu(x): ...@@ -43,9 +44,8 @@ def gelu(x):
\mathrm{gelu}(x) = \frac{x}{2} \left(1 + \mathrm{tanh} \left( \mathrm{gelu}(x) = \frac{x}{2} \left(1 + \mathrm{tanh} \left(
\sqrt{\frac{2}{\pi}} \left(x + 0.044715 x^3 \right) \right) \right) \sqrt{\frac{2}{\pi}} \left(x + 0.044715 x^3 \right) \right) \right)
We explicitly use the approximation rather than the exact formulation for We explicitly use the approximation rather than the exact formulation for speed. For more information, see
speed. For more information, see `Gaussian Error Linear Units (GELUs) `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_, section 2.
<https://arxiv.org/abs/1606.08415>`_, section 2.
""" """
return x * 0.5 * (1.0 + jax.lax.erf(x / jnp.sqrt(2.0))) return x * 0.5 * (1.0 + jax.lax.erf(x / jnp.sqrt(2.0)))
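Note that the body shown here uses the exact erf formulation even though the docstring advertises the tanh approximation; for reference, the approximation spelled out in the formula above would read roughly as follows (a sketch, not code from this file)::

    import jax.numpy as jnp

    def gelu_tanh_approx(x):
        # tanh approximation of GELU, section 2 of https://arxiv.org/abs/1606.08415
        return 0.5 * x * (1.0 + jnp.tanh(jnp.sqrt(2.0 / jnp.pi) * (x + 0.044715 * x ** 3)))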
......
...@@ -103,6 +103,7 @@ _TOKENIZER_FOR_DOC = "FSMTTokenizer" ...@@ -103,6 +103,7 @@ _TOKENIZER_FOR_DOC = "FSMTTokenizer"
# TODO: # TODO:
# - port model ensemble (fs uses 4 model checkpoints) # - port model ensemble (fs uses 4 model checkpoints)
# - solve beam search discrepancies # - solve beam search discrepancies
# docstyle-ignore
""" """
...@@ -180,14 +181,15 @@ FSMT_START_DOCSTRING = r""" ...@@ -180,14 +181,15 @@ FSMT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.FSMTConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.FSMTConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
FSMT_GENERATION_EXAMPLE = r""" FSMT_GENERATION_EXAMPLE = r"""
...@@ -214,14 +216,13 @@ FSMT_INPUTS_DOCSTRING = r""" ...@@ -214,14 +216,13 @@ FSMT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.FSMTTokenizer`. Indices can be obtained using :class:`~transformers.FSMTTokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
...@@ -232,21 +233,19 @@ FSMT_INPUTS_DOCSTRING = r""" ...@@ -232,21 +233,19 @@ FSMT_INPUTS_DOCSTRING = r"""
shifting the input_ids right, following the paper. shifting the input_ids right, following the paper.
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
also be used by default. also be used by default. If you want to change padding behavior, you should read
If you want to change padding behavior, you should read :func:`modeling_fsmt._prepare_fsmt_decoder_inputs` and modify. See diagram 1 in the paper for more info on
:func:`modeling_fsmt._prepare_fsmt_decoder_inputs` and modify. the default strategy.
See diagram 1 in the paper for more info on the default strategy.
encoder_outputs (:obj:`Tuple(torch.FloatTensor)`, `optional`): encoder_outputs (:obj:`Tuple(torch.FloatTensor)`, `optional`):
Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: :obj:`attentions`) Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
:obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
the decoder.
past_key_values (:obj:`Tuple(torch.FloatTensor)` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): past_key_values (:obj:`Tuple(torch.FloatTensor)` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
If :obj:`past_key_values` are used, the user can optionally input only the last (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
:obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
:obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape
:obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`). decoding (see :obj:`past_key_values`).
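The default :obj:`decoder_attention_mask` behavior described above (ignore padding, plus a causal mask) can be sketched as follows; this is an illustration, not the library's helper::

    import torch

    def default_decoder_masks(decoder_input_ids, pad_token_id):
        # 1 for real tokens, 0 for padding
        padding_mask = decoder_input_ids.ne(pad_token_id)
        tgt_len = decoder_input_ids.size(1)
        # lower-triangular mask: position i may only attend to positions <= i
        causal_mask = torch.tril(torch.ones(tgt_len, tgt_len, dtype=torch.bool))
        return padding_mask, causal_mask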
...@@ -282,9 +281,10 @@ def invert_mask(attention_mask): ...@@ -282,9 +281,10 @@ def invert_mask(attention_mask):
def _prepare_fsmt_decoder_inputs( def _prepare_fsmt_decoder_inputs(
config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32 config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32
): ):
"""Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if """
none are provided. This mimics the default behavior in fairseq. To override it, pass in masks. Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
Note: this is not called during generation. This mimics the default behavior in fairseq. To override it, pass in masks. Note: this is not called during
generation.
""" """
pad_token_id = config.pad_token_id pad_token_id = config.pad_token_id
if decoder_input_ids is None: if decoder_input_ids is None:
...@@ -406,8 +406,8 @@ class EncoderLayer(nn.Module): ...@@ -406,8 +406,8 @@ class EncoderLayer(nn.Module):
class FSMTEncoder(nn.Module): class FSMTEncoder(nn.Module):
""" """
Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
is a :class:`EncoderLayer`. :class:`EncoderLayer`.
Args: Args:
config: FSMTConfig config: FSMTConfig
...@@ -435,14 +435,14 @@ class FSMTEncoder(nn.Module): ...@@ -435,14 +435,14 @@ class FSMTEncoder(nn.Module):
Args: Args:
input_ids (LongTensor): tokens in the source language of shape input_ids (LongTensor): tokens in the source language of shape
`(batch, src_len)` `(batch, src_len)`
attention_mask (torch.LongTensor): indicating which indices are padding tokens. attention_mask (torch.LongTensor): indicating which indices are padding tokens.
Returns: Returns:
BaseModelOutput or Tuple comprised of: BaseModelOutput or Tuple comprised of:
- **x** (Tensor): the last encoder layer's output of
shape `(src_len, batch, embed_dim)` - **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
- **encoder_states** (tuple(torch.FloatTensor)): all intermediate - **encoder_states** (tuple(torch.FloatTensor)): all intermediate hidden states of shape `(src_len,
hidden states of shape `(src_len, batch, embed_dim)`. batch, embed_dim)`. Only populated if *output_hidden_states* is True.
Only populated if *output_hidden_states* is True.
- **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer. - **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout. During training might not be of length n_layers because of layer dropout.
""" """
...@@ -570,8 +570,8 @@ class DecoderLayer(nn.Module): ...@@ -570,8 +570,8 @@ class DecoderLayer(nn.Module):
class FSMTDecoder(nn.Module): class FSMTDecoder(nn.Module):
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DecoderLayer`.
is a :class:`DecoderLayer`.
Args: Args:
config: FSMTConfig config: FSMTConfig
embed_tokens (torch.nn.Embedding): output embedding embed_tokens (torch.nn.Embedding): output embedding
...@@ -614,8 +614,8 @@ class FSMTDecoder(nn.Module): ...@@ -614,8 +614,8 @@ class FSMTDecoder(nn.Module):
**unused, **unused,
): ):
""" """
Includes several features from "Jointly Learning to Align and Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
Translate with Transformer Models" (Garg et al., EMNLP 2019). EMNLP 2019).
Args: Args:
input_ids (LongTensor): previous decoder outputs of shape input_ids (LongTensor): previous decoder outputs of shape
...@@ -627,6 +627,7 @@ class FSMTDecoder(nn.Module): ...@@ -627,6 +627,7 @@ class FSMTDecoder(nn.Module):
Returns: Returns:
BaseModelOutputWithPast or tuple: BaseModelOutputWithPast or tuple:
- the decoder's features of shape `(batch, tgt_len, embed_dim)` - the decoder's features of shape `(batch, tgt_len, embed_dim)`
- the cache - the cache
- hidden states - hidden states
...@@ -1058,10 +1059,9 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel): ...@@ -1058,10 +1059,9 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
with labels in ``[0, ..., config.vocab_size]``.
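A common way to build such labels, with ``tokenizer`` and ``decoder_input_ids`` as illustrative placeholders::

    labels = decoder_input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100  # ignored by the loss, per the docstring above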
Returns: Returns:
...@@ -1157,8 +1157,7 @@ class SinusoidalPositionalEmbedding(nn.Embedding): ...@@ -1157,8 +1157,7 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
""" """
This module produces sinusoidal positional embeddings of any length. This module produces sinusoidal positional embeddings of any length.
We don't want to save the weight of this embedding since it's not trained We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.
(deterministic) and it can be huge.
Padding symbols are ignored. Padding symbols are ignored.
...@@ -1182,10 +1181,11 @@ class SinusoidalPositionalEmbedding(nn.Embedding): ...@@ -1182,10 +1181,11 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
@staticmethod @staticmethod
def get_embedding(num_embeddings, embedding_dim, padding_idx): def get_embedding(num_embeddings, embedding_dim, padding_idx):
"""Build sinusoidal embeddings. """
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
from the description in Section 3.5 of "Attention Is All You Need". "Attention Is All You Need".
""" """
half_dim = embedding_dim // 2 half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1) emb = math.log(10000) / (half_dim - 1)
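For orientation, the computation started by these two lines usually continues along the following sketch of the fairseq/tensor2tensor recipe (a reconstruction, not the verbatim function body)::

    import math
    import torch

    def get_embedding(num_embeddings, embedding_dim, padding_idx):
        # Half of the dimensions carry sin values, the other half cos values.
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        # Geometric progression of inverse frequencies.
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        # Outer product of positions and frequencies, then concatenate sin and cos.
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:
            # Zero-pad the last dimension when embedding_dim is odd.
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb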
...@@ -1201,7 +1201,8 @@ class SinusoidalPositionalEmbedding(nn.Embedding): ...@@ -1201,7 +1201,8 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
@staticmethod @staticmethod
def make_positions(tensor, padding_idx: int): def make_positions(tensor, padding_idx: int):
"""Replace non-padding symbols with their position numbers. """
Replace non-padding symbols with their position numbers.
Position numbers begin at padding_idx+1. Padding symbols are ignored. Position numbers begin at padding_idx+1. Padding symbols are ignored.
""" """
...@@ -664,8 +664,9 @@ class FunnelEncoder(nn.Module): ...@@ -664,8 +664,9 @@ class FunnelEncoder(nn.Module):
def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False): def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False):
"""Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length """
dimension.""" Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
"""
if stride == 1: if stride == 1:
return x return x
if separate_cls: if separate_cls:
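The rest of the function falls outside this hunk; a hedged reconstruction of the whole routine from the docstring and the visible lines (``torch.repeat_interleave`` does the token repetition)::

    import torch

    def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False):
        if stride == 1:
            return x
        if separate_cls:
            cls = x[:, :1]  # keep the class token out of the repetition
            x = x[:, 1:]
        output = torch.repeat_interleave(x, repeats=stride, dim=1)
        if separate_cls:
            if truncate_seq:
                output = torch.nn.functional.pad(output, (0, 0, 0, stride - 1, 0, 0))
            output = output[:, : target_len - 1]
            output = torch.cat([cls, output], dim=1)
        else:
            output = output[:, :target_len]
        return output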
...@@ -748,8 +749,9 @@ class FunnelDiscriminatorPredictions(nn.Module): ...@@ -748,8 +749,9 @@ class FunnelDiscriminatorPredictions(nn.Module):
class FunnelPreTrainedModel(PreTrainedModel): class FunnelPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained models. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
""" """
config_class = FunnelConfig config_class = FunnelConfig
...@@ -809,8 +811,8 @@ class FunnelForPreTrainingOutput(ModelOutput): ...@@ -809,8 +811,8 @@ class FunnelForPreTrainingOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -824,22 +826,22 @@ class FunnelForPreTrainingOutput(ModelOutput): ...@@ -824,22 +826,22 @@ class FunnelForPreTrainingOutput(ModelOutput):
FUNNEL_START_DOCSTRING = r""" FUNNEL_START_DOCSTRING = r"""
The Funnel Transformer model was proposed in The Funnel Transformer model was proposed in `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
`Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing Language Processing <https://arxiv.org/abs/2006.03236>`__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
<https://arxiv.org/abs/2006.03236>`__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.FunnelConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.FunnelConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
FUNNEL_INPUTS_DOCSTRING = r""" FUNNEL_INPUTS_DOCSTRING = r"""
...@@ -847,22 +849,21 @@ FUNNEL_INPUTS_DOCSTRING = r""" ...@@ -847,22 +849,21 @@ FUNNEL_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. Indices can be obtained using :class:`~transformers.BertTokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
...@@ -884,8 +885,10 @@ FUNNEL_INPUTS_DOCSTRING = r""" ...@@ -884,8 +885,10 @@ FUNNEL_INPUTS_DOCSTRING = r"""
@add_start_docstrings( @add_start_docstrings(
""" The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called """
decoder) or any task-specific head on top.""", The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
decoder) or any task-specific head on top.
""",
FUNNEL_START_DOCSTRING, FUNNEL_START_DOCSTRING,
) )
class FunnelBaseModel(FunnelPreTrainedModel): class FunnelBaseModel(FunnelPreTrainedModel):
...@@ -1065,7 +1068,8 @@ class FunnelModel(FunnelPreTrainedModel): ...@@ -1065,7 +1068,8 @@ class FunnelModel(FunnelPreTrainedModel):
add_start_docstrings( add_start_docstrings(
""" """
Funnel Transformer model with a binary classification head on top as used during pretraining for identifying Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
generated tokens.""", generated tokens.
""",
FUNNEL_START_DOCSTRING, FUNNEL_START_DOCSTRING,
) )
...@@ -1093,8 +1097,8 @@ class FunnelForPreTraining(FunnelPreTrainedModel): ...@@ -1093,8 +1097,8 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
): ):
r""" r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`): labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring). Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see :obj:`input_ids`
Indices should be in ``[0, 1]``: docstring). Indices should be in ``[0, 1]``:
- 0 indicates the token is an original token, - 0 indicates the token is an original token,
- 1 indicates the token was replaced. - 1 indicates the token was replaced.
...@@ -1184,10 +1188,9 @@ class FunnelForMaskedLM(FunnelPreTrainedModel): ...@@ -1184,10 +1188,9 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
in ``[0, ..., config.vocab_size]``.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1222,8 +1225,10 @@ class FunnelForMaskedLM(FunnelPreTrainedModel): ...@@ -1222,8 +1225,10 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of """
the first timestep of the last hidden state) e.g. for GLUE tasks. """, Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of the
first timestep of the last hidden state) e.g. for GLUE tasks.
""",
FUNNEL_START_DOCSTRING, FUNNEL_START_DOCSTRING,
) )
class FunnelForSequenceClassification(FunnelPreTrainedModel): class FunnelForSequenceClassification(FunnelPreTrainedModel):
...@@ -1255,9 +1260,8 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel): ...@@ -1255,9 +1260,8 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1299,8 +1303,10 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel): ...@@ -1299,8 +1303,10 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Funnel Transformer Model with a multiple choice classification head on top (two linear layer on top of """
the first timestep of the last hidden state, and a softmax) e.g. for RocStories/SWAG tasks. """, Funnel Transformer Model with a multiple choice classification head on top (two linear layer on top of the first
timestep of the last hidden state, and a softmax) e.g. for RocStories/SWAG tasks.
""",
FUNNEL_START_DOCSTRING, FUNNEL_START_DOCSTRING,
) )
class FunnelForMultipleChoice(FunnelPreTrainedModel): class FunnelForMultipleChoice(FunnelPreTrainedModel):
...@@ -1331,9 +1337,9 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel): ...@@ -1331,9 +1337,9 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
of the input tensors. (See :obj:`input_ids` above) :obj:`input_ids` above)
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
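To make the shape contract concrete, a small hypothetical setup (all values invented)::

    import torch

    batch_size, num_choices, seq_len = 2, 3, 16
    input_ids = torch.randint(0, 1000, (batch_size, num_choices, seq_len))
    labels = torch.tensor([0, 2])  # correct choice per example, in [0, num_choices - 1]

    # The choice dimension is flattened before the encoder, as in the line above:
    flat_input_ids = input_ids.view(-1, seq_len)  # (batch_size * num_choices, seq_len)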
...@@ -1380,8 +1386,10 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel): ...@@ -1380,8 +1386,10 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Funnel Transformer Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, Funnel Transformer Model with a token classification head on top (a linear layer on top of the hidden-states
output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
FUNNEL_START_DOCSTRING, FUNNEL_START_DOCSTRING,
) )
class FunnelForTokenClassification(FunnelPreTrainedModel): class FunnelForTokenClassification(FunnelPreTrainedModel):
...@@ -1415,8 +1423,8 @@ class FunnelForTokenClassification(FunnelPreTrainedModel): ...@@ -1415,8 +1423,8 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
Indices should be in ``[0, ..., config.num_labels - 1]``. 1]``.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1461,8 +1469,10 @@ class FunnelForTokenClassification(FunnelPreTrainedModel): ...@@ -1461,8 +1469,10 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like """
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).""", Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like SQuAD
(a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FUNNEL_START_DOCSTRING, FUNNEL_START_DOCSTRING,
) )
class FunnelForQuestionAnswering(FunnelPreTrainedModel): class FunnelForQuestionAnswering(FunnelPreTrainedModel):
...@@ -1497,12 +1507,12 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel): ...@@ -1497,12 +1507,12 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -322,8 +322,9 @@ class Block(nn.Module): ...@@ -322,8 +322,9 @@ class Block(nn.Module):
class GPT2PreTrainedModel(PreTrainedModel): class GPT2PreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained models. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
""" """
config_class = GPT2Config config_class = GPT2Config
...@@ -361,8 +362,8 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): ...@@ -361,8 +362,8 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding. :obj:`past_key_values` input) to speed up sequential decoding.
...@@ -372,8 +373,8 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): ...@@ -372,8 +373,8 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -394,60 +395,59 @@ GPT2_START_DOCSTRING = r""" ...@@ -394,60 +395,59 @@ GPT2_START_DOCSTRING = r"""
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
GPT2_INPUTS_DOCSTRING = r""" GPT2_INPUTS_DOCSTRING = r"""
Args: Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
:obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
Indices of input sequence tokens in the vocabulary. sequence tokens in the vocabulary.
If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be passed If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
as ``input_ids``. passed as ``input_ids``.
Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
(see :obj:`past_key_values` output below). Can be used to speed up sequential decoding. :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
The ``input_ids`` which have their past given to this model should not be passed as ``input_ids`` as they have their past given to this model should not be passed as ``input_ids`` as they have already been
have already been computed. computed. (A usage sketch follows this argument list.)
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
Selected in the range ``[0, config.max_position_embeddings - 1]``. config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
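As referenced under :obj:`past_key_values` above, a hedged usage sketch of cached sequential decoding (greedy, one extra token; the 3.x-era keyword arguments are assumed)::

    import torch
    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    prompt_ids = tokenizer("Hello, my dog", return_tensors="pt").input_ids
    # First pass: the full prompt, caching keys/values of every attention block.
    out = model(input_ids=prompt_ids, use_cache=True, return_dict=True)

    # Next pass: feed only the newly chosen token; the cache supplies the rest.
    next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
    out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True, return_dict=True)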
...@@ -496,8 +496,8 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -496,8 +496,8 @@ class GPT2Model(GPT2PreTrainedModel):
self.wte = new_embeddings self.wte = new_embeddings
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model. """
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads) self.h[layer].attn.prune_heads(heads)
...@@ -680,8 +680,10 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -680,8 +680,10 @@ class GPT2Model(GPT2PreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""The GPT2 Model transformer with a language modeling head on top """
(linear layer with weights tied to the input embeddings). """, The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class GPT2LMHeadModel(GPT2PreTrainedModel): class GPT2LMHeadModel(GPT2PreTrainedModel):
...@@ -748,11 +750,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -748,11 +750,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for language modeling. Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``. ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
Indices are selected in ``[-100, 0, ..., config.vocab_size]``. ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``.
""" """
if "past" in kwargs: if "past" in kwargs:
warnings.warn( warnings.warn(
...@@ -805,10 +805,11 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -805,10 +805,11 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""The GPT2 Model transformer with a language modeling and a multiple-choice classification """
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
The language modeling head has its weights tied to the input embeddings, RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
the classification head takes as input the hidden state of a specified classification token index in the input sequence. input embeddings, the classification head takes as input the hidden state of a specified classification token index in the
input sequence.
""", """,
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
...@@ -858,18 +859,16 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -858,18 +859,16 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
): ):
r""" r"""
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input): mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
Index of the classification token in each input sequence. Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
Selected in the range ``[0, input_ids.size(-1) - 1]``. 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for language modeling. Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``. ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
Indices are selected in ``[-100, 0, ..., config.vocab_size]``. ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``.
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`): mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
of the input tensors. (see `input_ids` above) `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
...@@ -963,17 +962,17 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -963,17 +962,17 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""The GPT2 Model transformer with a sequence classification head on top """
(linear layer). The GPT2 Model transformer with a sequence classification head on top (linear layer).
:class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as
other causal models (e.g. GPT-1) do. other causal models (e.g. GPT-1) do.
Since it does classification on the last token, it needs to know the position of the last token. Since it does classification on the last token, it needs to know the position of the last token. If a
If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
Since it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
does the same (take the last value in each row of the batch). the last value in each row of the batch).
""", """,
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
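A hedged sketch of the last-token lookup described above, assuming right-padding and a configured ``pad_token_id`` (toy values)::

    import torch

    def last_token_indices(input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
        # Non-padding tokens per row; the last real token sits one before that count.
        return torch.ne(input_ids, pad_token_id).sum(-1) - 1

    logits = torch.randn(2, 6, 3)                        # (batch, seq_len, num_labels)
    input_ids = torch.tensor([[5, 8, 2, 0, 0, 0],
                              [7, 7, 7, 7, 7, 1]])       # 0 plays the pad token here
    idx = last_token_indices(input_ids, pad_token_id=0)  # -> tensor([2, 5])
    pooled = logits[torch.arange(2), idx]                # logits at the last real token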
...@@ -1012,9 +1011,8 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel): ...@@ -1012,9 +1011,8 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -487,8 +487,9 @@ class LayoutLMOnlyMLMHead(nn.Module): ...@@ -487,8 +487,9 @@ class LayoutLMOnlyMLMHead(nn.Module):
class LayoutLMPreTrainedModel(PreTrainedModel): class LayoutLMPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained models. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
""" """
config_class = LayoutLMConfig config_class = LayoutLMConfig
...@@ -508,18 +509,19 @@ class LayoutLMPreTrainedModel(PreTrainedModel): ...@@ -508,18 +509,19 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
module.bias.data.zero_() module.bias.data.zero_()
LAYOUTLM_START_DOCSTRING = r""" The LayoutLM model was proposed in LAYOUTLM_START_DOCSTRING = r"""
`LayoutLM: Pre-training of Text and Layout for Document Image Understanding The LayoutLM model was proposed in `LayoutLM: Pre-training of Text and Layout for Document Image Understanding
<https://arxiv.org/abs/1912.13318>`__ by.... <https://arxiv.org/abs/1912.13318>`__ by....
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
usage and behavior. behavior.
Parameters: Parameters:
config (:class:`~transformers.LayoutLMConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.LayoutLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
LAYOUTLM_INPUTS_DOCSTRING = r""" LAYOUTLM_INPUTS_DOCSTRING = r"""
...@@ -527,45 +529,44 @@ LAYOUTLM_INPUTS_DOCSTRING = r""" ...@@ -527,45 +529,44 @@ LAYOUTLM_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.LayoutLMTokenizer`. Indices can be obtained using :class:`transformers.LayoutLMTokenizer`. See
See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
:func:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
bbox (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): bbox (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Bounding Boxes of each input sequence tokens. Bounding Boxes of each input sequence tokens. Selected in the range ``[0, config.max_2d_position_embeddings
Selected in the range ``[0, config.max_2d_position_embeddings - 1]``. - 1]``.
`What are bboxes? <../glossary.html#position-ids>`_ `What are bboxes? <../glossary.html#position-ids>`_
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for
Mask values selected in ``[0, 1]``: tokens that are NOT MASKED, ``0`` for MASKED tokens.
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token
corresponds to a `sentence B` token
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
Selected in the range ``[0, config.max_position_embeddings - 1]``. config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1`
Mask values selected in ``[0, 1]``: indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`): output_attentions (:obj:`bool`, `optional`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under
returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`): output_hidden_states (:obj:`bool`, `optional`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned
tensors for more detail.
return_dict (:obj:`bool`, `optional`): return_dict (:obj:`bool`, `optional`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple. plain tuple.
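As referenced under :obj:`bbox` above, a toy example of the expected geometry (coordinates invented; LayoutLM's convention normalizes boxes to a 0-1000 page grid)::

    import torch

    # One batch, two tokens; each box is (x0, y0, x1, y1) on the normalized page.
    bbox = torch.tensor([[[637, 773, 693, 782],
                          [698, 773, 733, 782]]])  # shape: (batch_size, seq_len, 4)
    assert int(bbox.max()) < 1024  # keep below max_2d_position_embeddings (1024 by default)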
...@@ -599,9 +600,9 @@ class LayoutLMModel(LayoutLMPreTrainedModel): ...@@ -599,9 +600,9 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model. """
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
See base class PreTrainedModel class PreTrainedModel.
""" """
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
...@@ -632,20 +633,21 @@ class LayoutLMModel(LayoutLMPreTrainedModel): ...@@ -632,20 +633,21 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
input_ids (torch.LongTensor of shape (batch_size, sequence_length)): input_ids (torch.LongTensor of shape (batch_size, sequence_length)):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional): attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]: 1 for tokens
Mask values selected in [0, 1]: 1 for tokens that are NOT MASKED, 0 for MASKED tokens. that are NOT MASKED, 0 for MASKED tokens.
token_type_ids (torch.LongTensor of shape (batch_size, sequence_length), optional): token_type_ids (torch.LongTensor of shape (batch_size, sequence_length), optional):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]:
Indices are selected in [0, 1]: 0 corresponds to a sentence A token, 1 corresponds to a sentence B token 0 corresponds to a sentence A token, 1 corresponds to a sentence B token
position_ids (torch.LongTensor of shape (batch_size, sequence_length), optional): position_ids (torch.LongTensor of shape (batch_size, sequence_length), optional):
Indices of positions of each input sequence tokens in the position embeddings. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range [0,
Selected in the range [0, config.max_position_embeddings - 1]. config.max_position_embeddings - 1].
head_mask (torch.FloatTensor of shape (num_heads,) or (num_layers, num_heads), optional): head_mask (torch.FloatTensor of shape (num_heads,) or (num_layers, num_heads), optional):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in [0, 1]: 1 indicates
Mask values selected in [0, 1]: 1 indicates the head is not masked, 0 indicates the head is masked. the head is not masked, 0 indicates the head is masked.
inputs_embeds (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size), optional): inputs_embeds (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size), optional):
Optionally, instead of passing input_ids you can choose to directly pass an embedded representation. Optionally, instead of passing input_ids you can choose to directly pass an embedded representation. This
This is useful if you want more control over how to convert input_ids indices into associated vectors than the model’s internal embedding lookup matrix. is useful if you want more control over how to convert input_ids indices into associated vectors than the
model’s internal embedding lookup matrix.
output_attentions (bool, optional): output_attentions (bool, optional):
If set to True, the attentions tensors of all attention layers are returned. If set to True, the attentions tensors of all attention layers are returned.
output_hidden_states (bool, optional): output_hidden_states (bool, optional):
...@@ -807,8 +809,10 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel): ...@@ -807,8 +809,10 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""LayoutLM Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
LAYOUTLM_START_DOCSTRING, LAYOUTLM_START_DOCSTRING,
) )
class LayoutLMForTokenClassification(LayoutLMPreTrainedModel): class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
...@@ -80,9 +80,8 @@ def _get_question_end_index(input_ids, sep_token_id): ...@@ -80,9 +80,8 @@ def _get_question_end_index(input_ids, sep_token_id):
def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True): def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
""" """
Computes global attention mask by putting attention on all tokens Computes global attention mask by putting attention on all tokens before `sep_token_id` if `before_sep_token is
before `sep_token_id` if `before_sep_token is True` else after True` else after `sep_token_id`.
`sep_token_id`.
""" """
question_end_index = _get_question_end_index(input_ids, sep_token_id) question_end_index = _get_question_end_index(input_ids, sep_token_id)
question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1 question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1
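The ``before_sep_token=True`` branch can be sketched from the two visible lines (a hypothetical standalone helper; the real method also covers the ``False`` case)::

    import torch

    def global_attention_before_sep(input_ids: torch.Tensor, question_end_index: torch.Tensor) -> torch.Tensor:
        # question_end_index: (batch_size, 1), index of the sep token in each row.
        positions = torch.arange(input_ids.shape[1], device=input_ids.device)
        # 1 (global attention) for every position strictly before the sep token.
        return (positions.expand_as(input_ids) < question_end_index).to(torch.uint8)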
...@@ -101,9 +100,9 @@ def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=Tru ...@@ -101,9 +100,9 @@ def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=Tru
# Copied from transformers.modeling_roberta.create_position_ids_from_input_ids # Copied from transformers.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx): def create_position_ids_from_input_ids(input_ids, padding_idx):
"""Replace non-padding symbols with their position numbers. Position numbers begin at """
padding_idx+1. Padding symbols are ignored. This is modified from fairseq's Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
`utils.make_positions`. are ignored. This is modified from fairseq's `utils.make_positions`.
Args: Args:
input_ids: torch.Tensor input_ids: torch.Tensor
...@@ -175,8 +174,8 @@ class LongformerEmbeddings(nn.Module): ...@@ -175,8 +174,8 @@ class LongformerEmbeddings(nn.Module):
return embeddings return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds): def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""We are provided embeddings directly. We cannot infer which are padded so just generate """
sequential position ids. We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args: Args:
inputs_embeds: torch.Tensor inputs_embeds: torch.Tensor
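A minimal sketch of that fallback as a standalone helper (the module method presumably reads the padding index from ``self``; positions start after ``padding_idx`` to stay consistent with the token-id path)::

    import torch

    def sequential_position_ids(inputs_embeds: torch.Tensor, padding_idx: int) -> torch.Tensor:
        # Nothing marks padding here, so every slot gets the next sequential position.
        batch_size, seq_len = inputs_embeds.shape[:2]
        position_ids = torch.arange(
            padding_idx + 1, seq_len + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(batch_size, seq_len)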
...@@ -233,11 +232,11 @@ class LongformerSelfAttention(nn.Module): ...@@ -233,11 +232,11 @@ class LongformerSelfAttention(nn.Module):
output_attentions=False, output_attentions=False,
): ):
""" """
LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`. LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`. Padding to
Padding to `attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer. `attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer.
The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to -ve: no attention
The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to
-ve: no attention
0: local attention 0: local attention
+ve: global attention +ve: global attention
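Where the 0, 1, 2 values come from can be shown with a toy merge of the two user-facing masks (the multiply-by-``global + 1`` trick mirrors how LongformerModel combines them)::

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])          # 1 = real token, 0 = padding
    global_attention_mask = torch.tensor([[1, 0, 0, 0]])   # global attention on token 0
    merged = attention_mask * (global_attention_mask + 1)  # -> tensor([[2, 1, 1, 0]])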
...@@ -408,8 +407,10 @@ class LongformerSelfAttention(nn.Module): ...@@ -408,8 +407,10 @@ class LongformerSelfAttention(nn.Module):
@staticmethod @staticmethod
def _pad_and_diagonalize(chunked_hidden_states): def _pad_and_diagonalize(chunked_hidden_states):
"""shift every row 1 step right, converting columns into diagonals. """
Example: shift every row 1 step right, converting columns into diagonals.
Example::
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492, chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
-1.8348, 0.7672, 0.2986, 0.0285, -1.8348, 0.7672, 0.2986, 0.0285,
-0.7584, 0.4206, -0.0405, 0.1599, -0.7584, 0.4206, -0.0405, 0.1599,
...@@ -470,9 +471,11 @@ class LongformerSelfAttention(nn.Module): ...@@ -470,9 +471,11 @@ class LongformerSelfAttention(nn.Module):
ending_input.masked_fill_(ending_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 ending_input.masked_fill_(ending_mask == 1, -float("inf")) # `== 1` converts to bool or uint8
def _sliding_chunks_query_key_matmul(self, query: torch.Tensor, key: torch.Tensor, window_overlap: int): def _sliding_chunks_query_key_matmul(self, query: torch.Tensor, key: torch.Tensor, window_overlap: int):
"""Matrix multiplication of query and key tensors using with a sliding window attention pattern. """
This implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) Matrix multiplication of query and key tensors using with a sliding window attention pattern. This
with an overlap of size window_overlap""" implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an
overlap of size window_overlap
"""
batch_size, seq_len, num_heads, head_dim = query.size() batch_size, seq_len, num_heads, head_dim = query.size()
assert ( assert (
seq_len % (window_overlap * 2) == 0 seq_len % (window_overlap * 2) == 0
...@@ -536,8 +539,10 @@ class LongformerSelfAttention(nn.Module): ...@@ -536,8 +539,10 @@ class LongformerSelfAttention(nn.Module):
def _sliding_chunks_matmul_attn_probs_value( def _sliding_chunks_matmul_attn_probs_value(
self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
): ):
"""Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. """
Returned tensor will be of the same shape as `attn_probs`""" Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
same shape as `attn_probs`
"""
batch_size, seq_len, num_heads, head_dim = value.size() batch_size, seq_len, num_heads, head_dim = value.size()
assert seq_len % (window_overlap * 2) == 0 assert seq_len % (window_overlap * 2) == 0
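The chunking itself can be pictured with a standalone helper (a hedged reconstruction, not the library's exact routine; it assumes a contiguous input whose ``seq_len`` is divisible by ``2 * window_overlap``, as the assert above enforces)::

    import torch

    def overlapping_chunks(x: torch.Tensor, window_overlap: int) -> torch.Tensor:
        # x: (batch * heads, seq_len, head_dim) -> chunks of size 2w overlapping by w.
        batch, seq_len, dim = x.shape
        n_chunks = seq_len // window_overlap - 1
        # Strided view: chunk i covers tokens [i * w, i * w + 2w), no copy involved.
        return x.as_strided(
            size=(batch, n_chunks, 2 * window_overlap, dim),
            stride=(x.stride(0), window_overlap * x.stride(1), x.stride(1), x.stride(2)),
        )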
...@@ -968,8 +973,8 @@ class LongformerLMHead(nn.Module): ...@@ -968,8 +973,8 @@ class LongformerLMHead(nn.Module):
class LongformerPreTrainedModel(PreTrainedModel): class LongformerPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. models.
""" """
...@@ -996,9 +1001,9 @@ LONGFORMER_START_DOCSTRING = r""" ...@@ -996,9 +1001,9 @@ LONGFORMER_START_DOCSTRING = r"""
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.LongformerConfig`): Model configuration class with all the parameters of the config (:class:`~transformers.LongformerConfig`): Model configuration class with all the parameters of the
@@ -1012,41 +1017,40 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.LongformerTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attend to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the <s> token should be given global attention. For QA, all question tokens should also
have global attention (see the sketch after this list). Please refer to the `Longformer paper
<https://arxiv.org/abs/2004.05150>`__ for more details. Mask values selected in ``[0, 1]``:
- 0 for local attention (a sliding window attention),
- 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
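A minimal sketch of building such a mask for a QA-style input (the tensor shapes and the 5-token question length are made-up values):

import torch

input_ids = torch.randint(0, 1000, (1, 16))          # (batch_size, seq_len), dummy ids
global_attention_mask = torch.zeros_like(input_ids)  # 0 = local (sliding window) attention
global_attention_mask[:, :5] = 1                     # give the question tokens global attention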
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
@@ -1071,17 +1075,16 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
class LongformerModel(LongformerPreTrainedModel):
"""
This class copied code from :class:`~transformers.RobertaModel` and overwrote standard self-attention with
Longformer self-attention to provide the ability to process long sequences following the self-attention approach
described in `Longformer: the Long-Document Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy,
Matthew E. Peters, and Arman Cohan. Longformer self-attention combines a local (sliding window) and global
attention to extend to long documents without the O(n^2) increase in memory and compute.

The self-attention module :obj:`LongformerSelfAttention` implemented here supports the combination of local and
global attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and
dilated attention are more relevant for autoregressive language modeling than finetuning on downstream tasks. A
future release will add support for autoregressive attention, but the support for dilated attention requires a
custom CUDA kernel to be memory and compute efficient.
"""
@@ -1112,9 +1115,9 @@ class LongformerModel(LongformerPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
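For illustration, a hedged call to the public pruning entry point (the layer and head indices are made up):

# Prune heads 0 and 2 in layer 1, and head 5 in layer 3 of an already-loaded model.
model.prune_heads({1: [0, 2], 3: [5]})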
@@ -1323,10 +1326,9 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
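A hedged sketch of such a labels tensor (assumes an existing `input_ids` tensor; the masked position is arbitrary):

import torch

labels = input_ids.clone()      # (batch_size, sequence_length)
labels[:, :] = -100             # -100 everywhere: ignored by the loss
labels[:, 3] = input_ids[:, 3]  # compute the MLM loss only at position 3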
@@ -1391,8 +1393,10 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
@add_start_docstrings(
"""
Longformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForSequenceClassification(LongformerPreTrainedModel):
@@ -1430,9 +1434,8 @@ class LongformerForSequenceClassification(LongformerPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
if :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
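A hedged fine-tuning sketch (label values are made up; `model`, `input_ids` and `attention_mask` stand for an already-prepared LongformerForSequenceClassification setup):

import torch

labels = torch.tensor([0, 2])  # one class index per sequence, in [0, config.num_labels - 1]
outputs = model(input_ids, attention_mask=attention_mask, labels=labels, return_dict=True)
loss = outputs.loss            # Cross-Entropy here, since num_labels > 1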
@@ -1499,8 +1502,10 @@ class LongformerClassificationHead(nn.Module):
@add_start_docstrings(
"""
Longformer Model with a span classification head on top for extractive question-answering tasks like SQuAD /
TriviaQA (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForQuestionAnswering(LongformerPreTrainedModel):
@@ -1535,12 +1540,12 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
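A hedged sketch of these span labels (the positions are made up for a batch of 2):

import torch

start_positions = torch.tensor([7, 12])  # answer starts at token 7 resp. 12
end_positions = torch.tensor([9, 15])    # corresponding answer end tokens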
Returns:
@@ -1630,8 +1635,10 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
@add_start_docstrings(
"""
Longformer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForTokenClassification(LongformerPreTrainedModel):
@@ -1670,8 +1677,8 @@ class LongformerForTokenClassification(LongformerPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
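A hedged sketch of token-level labels (the tag ids are made up, e.g. 0 = O, 1 = PER, 2 = LOC):

import torch

labels = torch.tensor([[0, 1, 1, 0, 2, 0]])  # shape (batch_size, sequence_length)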
@@ -1719,8 +1726,10 @@ class LongformerForTokenClassification(LongformerPreTrainedModel):
@add_start_docstrings(
"""
Longformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
LONGFORMER_START_DOCSTRING,
)
class LongformerForMultipleChoice(LongformerPreTrainedModel):
@@ -1755,9 +1764,9 @@ class LongformerForMultipleChoice(LongformerPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
num_choices - 1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
:obj:`input_ids` above)
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
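A hedged sketch of the expected multiple-choice shapes (all sizes are made up):

import torch

input_ids = torch.randint(0, 1000, (2, 4, 32))  # (batch_size, num_choices, seq_len)
labels = torch.tensor([1, 3])                   # index of the correct choice per example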
...
@@ -58,9 +58,9 @@ class GeLU(nn.Module):
@dataclass
class LxmertModelOutput(ModelOutput):
"""
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder)
Args:
@@ -69,29 +69,26 @@ class LxmertModelOutput(ModelOutput):
vision_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the visual encoder.
pooled_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
by a Linear layer and a Tanh activation function.
language_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
language_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
vision_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
""" """
language_output: Optional[torch.FloatTensor] = None language_output: Optional[torch.FloatTensor] = None
@@ -111,30 +108,28 @@ class LxmertForQuestionAnsweringOutput(ModelOutput):
Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
question_answering_score: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, n_qa_answers)`, `optional`):
Prediction scores of question answering objective (classification).
language_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
language_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
vision_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
""" """
loss: Optional[torch.FloatTensor] = None loss: Optional[torch.FloatTensor] = None
@@ -153,7 +148,8 @@ class LxmertForPreTrainingOutput(ModelOutput):
Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
@@ -162,26 +158,23 @@ class LxmertForPreTrainingOutput(ModelOutput):
question_answering_score: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for input features + one for the output of each cross-modality
layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
language_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
vision_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""
@@ -778,8 +771,9 @@ class LxmertPreTrainingHeads(nn.Module):
class LxmertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LxmertConfig
@@ -804,21 +798,22 @@ LXMERT_START_DOCSTRING = r"""
The LXMERT model was proposed in `LXMERT: Learning Cross-Modality Encoder Representations from Transformers
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pretrained on a variety of multi-modal datasets comprising GQA, VQAv2.0, MSCOCO captions, and Visual Genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
question answering attribute prediction, and object tag prediction.
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
general usage and behavior.
Parameters:
config (:class:`~transformers.LxmertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
LXMERT_INPUTS_DOCSTRING = r"""
@@ -827,9 +822,9 @@ LXMERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.LxmertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
visual_feats: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_visual_features, visual_feat_dim)`):
@@ -838,30 +833,28 @@ LXMERT_INPUTS_DOCSTRING = r"""
These are currently not provided by the transformers library.
visual_pos: (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_visual_features, visual_pos_dim)`):
This input represents spatial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
1.
These are currently not provided by the transformers library.
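A hedged sketch of such normalized box coordinates (the (x1, y1, x2, y2) convention and the values are assumptions for illustration):

import torch

# Two regions for one image, coordinates already scaled to [0, 1].
visual_pos = torch.tensor([[[0.10, 0.20, 0.55, 0.60],
                            [0.00, 0.00, 1.00, 1.00]]])  # (1, num_visual_features, 4)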
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
visual_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
@@ -1079,17 +1072,17 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
def resize_num_qa_labels(self, num_labels):
"""
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end.
Args:
cur_qa_logit_layer (:obj:`torch.nn.Linear`):
Old linear layer to be resized.
num_labels (:obj:`int`, `optional`):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
just returns a pointer to the qa labels :obj:`torch.nn.Linear` module of the model without doing
anything.
Return:
:obj:`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
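A hedged usage sketch (the label count is made up; `model` stands for an already-loaded LxmertForPreTraining instance):

# Resize the QA head to 1000 answer labels; newly added rows are freshly initialized.
qa_logit_layer = model.resize_num_qa_labels(1000)
print(qa_logit_layer.out_features)  # 1000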
@@ -1116,7 +1109,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
Returns:
:obj:`nn.Module`: A torch module mapping the question answering prediction hidden states or :obj:`None` if
LXMERT does not have a visual answering head.
"""
if hasattr(self, "answer_head"):
return self.answer_head.logit_fc[-1]
@@ -1173,17 +1166,16 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
):
r"""
masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
obj_labels: (``Dict[str, Tuple[torch.FloatTensor, torch.FloatTensor]]``, `optional`):
Each key is named after one of the visual losses, and each element of the tuple is of the shape
``(batch_size, num_features)`` and ``(batch_size, num_features, visual_feature_dim)`` for the label id and
the label score respectively.
matched_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for computing whether or not the text input matches the image (classification) loss. Input
should be a sequence pair (see :obj:`input_ids` docstring). Indices should be in ``[0, 1]``:
- 0 indicates that the sentence does not match the image,
- 1 indicates that the sentence does match the image.
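A hedged sketch of a matched_label tensor (values made up for a batch of 2 image-text pairs):

import torch

matched_label = torch.tensor([1, 0])  # first pair matches its image, second does not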
@@ -1302,17 +1294,17 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
def resize_num_qa_labels(self, num_labels):
"""
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end.
Args:
cur_qa_logit_layer (:obj:`torch.nn.Linear`):
Old linear layer to be resized.
num_labels (:obj:`int`, `optional`):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
just returns a pointer to the qa labels :obj:`torch.nn.Linear` module of the model without doing
anything.
Return:
:obj:`torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
@@ -1338,8 +1330,8 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
Returns the linear layer that produces question answering logits
Returns:
:obj:`nn.Module`: A torch module mapping the question answering prediction hidden states. :obj:`None`: A
NoneType object if Lxmert does not have the visual answering head.
"""
if hasattr(self, "answer_head"):
...