Doc styler v2 (#14950)

* New doc styler
* Fix issue with args at the start
* Code sample fixes
* Style code examples in MDX
* Fix more patterns
* Typo
* Typo
* More patterns
* Do without black for now
* Get more info in error
* Docstring style
* Re-enable check
* Quality
* Fix add_end_docstring decorator
* Fix docstring

Doc styler v2 (#14950)
* New doc styler
* Fix issue with args at the start
* Code sample fixes
* Style code examples in MDX
* Fix more patterns
* Typo
* Typo
* More patterns
* Do without black for now
* Get more info in error
* Docstring style
* Re-enable check
* Quality
* Fix add_end_docstring decorator
* Fix docstring
87e6e4fe · Sylvain Gugger · GitHub · c1138273 · 87e6e4fe · 87e6e4fe
Unverified Commit 87e6e4fe authored Dec 27, 2021 by Sylvain Gugger Committed by GitHub Dec 27, 2021
20 changed files
--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch DeBERTa model. """
+""" PyTorch DeBERTa model."""
 import math
 from collections.abc import Sequence
@@ -79,7 +79,8 @@ class XSoftmax(torch.autograd.Function):
    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
-        mask (`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+        mask (`torch.IntTensor`):
+            The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax
    Example:
@@ -480,8 +481,8 @@ def build_relative_position(query_size, key_size, device):
    Build relative position according to the query and key
    We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
-    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
-    P_q - P_k\\)
+    P_k\\)
    Args:
        query_size (int): the length of query
@@ -814,20 +815,20 @@ class DebertaPreTrainedModel(PreTrainedModel):
 DEBERTA_START_DOCSTRING = r"""
-    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
-    BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    general usage and behavior.```
+    and behavior.
    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DEBERTA_INPUTS_DOCSTRING = r"""
@@ -835,9 +836,8 @@ DEBERTA_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`DebertaTokenizer`]. See
+            Indices can be obtained using [`DebertaTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            [`PreTrainedTokenizer.__call__`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
@@ -848,20 +848,22 @@ DEBERTA_INPUTS_DOCSTRING = r"""
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
-            than the model's internal embedding lookup matrix.
+            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -988,7 +990,7 @@ class DebertaModel(DebertaPreTrainedModel):
        )
-@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING)
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
 class DebertaForMaskedLM(DebertaPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
@@ -1029,8 +1031,9 @@ class DebertaForMaskedLM(DebertaPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1169,8 +1172,9 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1362,12 +1366,12 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

--- a/src/transformers/models/deberta/modeling_tf_deberta.py
+++ b/src/transformers/models/deberta/modeling_tf_deberta.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" TF 2.0 DeBERTa model. """
+""" TF 2.0 DeBERTa model."""
 import math
@@ -395,8 +395,8 @@ def build_relative_position(query_size, key_size):
    Build relative position according to the query and key
    We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
-    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
-    P_q - P_k\\)
+    P_k\\)
    Args:
        query_size (int): the length of query
@@ -1001,13 +1001,14 @@ class TFDebertaPreTrainedModel(TFPreTrainedModel):
 DEBERTA_START_DOCSTRING = r"""
-    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
-    BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
-    it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
-    and behavior.
+    behavior.
    <Tip>
@@ -1016,11 +1017,11 @@ DEBERTA_START_DOCSTRING = r"""
    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.
-    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
-    the tensors in the first argument of the model call function: `model(inputs)`.
+    tensors in the first argument of the model call function: `model(inputs)`.
-    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
-    the first positional argument :
+    first positional argument:
    - a single Tensor with `input_ids` only and nothing else: `model(inputs_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
@@ -1033,18 +1034,16 @@ DEBERTA_START_DOCSTRING = r"""
    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DEBERTA_INPUTS_DOCSTRING = r"""
    Args:
-        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`)
+        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`DebertaTokenizer`]. See
+            Indices can be obtained using [`DebertaTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            [`PreTrainedTokenizer.__call__`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
@@ -1055,20 +1054,22 @@ DEBERTA_INPUTS_DOCSTRING = r"""
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
-            than the model's internal embedding lookup matrix.
+            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -1145,7 +1146,7 @@ class TFDebertaModel(TFDebertaPreTrainedModel):
        return TFBaseModelOutput(last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns)
-@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING)
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
 class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -1185,8 +1186,9 @@ class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLos
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        inputs = input_processing(
            func=self.call,
@@ -1286,8 +1288,9 @@ class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceCla
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        inputs = input_processing(
            func=self.call,
@@ -1476,12 +1479,12 @@ class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnswerin
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        """
        inputs = input_processing(
            func=self.call,

--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -203,8 +203,7 @@ class DebertaTokenizer(GPT2Tokenizer):
                Optional second list of IDs for sequence pairs.
        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

--- a/src/transformers/models/deberta/tokenization_deberta_fast.py
+++ b/src/transformers/models/deberta/tokenization_deberta_fast.py
@@ -129,8 +129,8 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
    @property
    def mask_token(self) -> str:
        """
-        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
-        not having been set.
+        having been set.
        Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
        comprise the space before the *[MASK]*.
@@ -196,8 +196,7 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
                Optional second list of IDs for sequence pairs.
        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

--- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DeBERTa-v2 model configuration """
+""" DeBERTa-v2 model configuration"""
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -30,13 +30,13 @@ DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class DebertaV2Config(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used
+    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
-    to instantiate a DeBERTa-v2 model according to the specified arguments, defining the model architecture.
+    DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
-    Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
+    configuration with the defaults will yield a similar configuration to that of the DeBERTa
    [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    outputs. Read the documentation from [`PretrainedConfig`] for more information.
+    documentation from [`PretrainedConfig`] for more information.
    Arguments:
        vocab_size (`int`, *optional*, defaults to 128100):
@@ -51,9 +51,9 @@ class DebertaV2Config(PretrainedConfig):
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"gelu"`, `"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`,
+            `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
-            `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
+            are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
@@ -62,8 +62,7 @@ class DebertaV2Config(PretrainedConfig):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 0):
-            The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or
+            The vocabulary size of the `token_type_ids` passed when calling [`DebertaV2Model`] or [`TFDebertaV2Model`].
-            [`TFDebertaModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-7):
@@ -71,8 +70,8 @@ class DebertaV2Config(PretrainedConfig):
        relative_attention (`bool`, *optional*, defaults to `True`):
            Whether to use relative position encoding.
        max_relative_positions (`int`, *optional*, defaults to -1):
-            The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
+            The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
-            value as `max_position_embeddings`.
+            as `max_position_embeddings`.
        pad_token_id (`int`, *optional*, defaults to 0):
            The value used to pad input_ids.
        position_biased_input (`bool`, *optional*, defaults to `False`):

--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch DeBERTa-v2 model. """
+""" PyTorch DeBERTa-v2 model."""
 import math
 from collections.abc import Sequence
@@ -80,7 +80,8 @@ class XSoftmax(torch.autograd.Function):
    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
-        mask (`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+        mask (`torch.IntTensor`):
+            The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax
    Example:
@@ -542,8 +543,8 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
    Build relative position according to the query and key
    We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
-    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
-    P_q - P_k\\)
+    P_k\\)
    Args:
        query_size (int): the length of query
@@ -925,20 +926,20 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
 DEBERTA_START_DOCSTRING = r"""
-    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
-    BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    general usage and behavior.```
+    and behavior.
    Parameters:
        config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DEBERTA_INPUTS_DOCSTRING = r"""
@@ -946,9 +947,8 @@ DEBERTA_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`DebertaV2Tokenizer`]. See
+            Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            [`PreTrainedTokenizer.__call__`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
@@ -959,20 +959,22 @@ DEBERTA_INPUTS_DOCSTRING = r"""
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
-            than the model's internal embedding lookup matrix.
+            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -1100,7 +1102,7 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
        )
-@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING)
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
 # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
 class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
@@ -1142,8 +1144,9 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1283,8 +1286,9 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1478,12 +1482,12 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" TF 2.0 DeBERTa-v2 model. """
+""" TF 2.0 DeBERTa-v2 model."""
 from typing import Dict, Optional, Tuple, Union
@@ -455,7 +455,6 @@ class TFDebertaV2Encoder(tf.keras.layers.Layer):
 def make_log_bucket_position(relative_pos, bucket_size, max_position):
-    """ """
    sign = tf.math.sign(relative_pos)
    mid = bucket_size // 2
    abs_pos = tf.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, tf.math.abs(relative_pos))
@@ -476,8 +475,8 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
    Build relative position according to the query and key
    We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
-    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} =
+    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
-    P_q - P_k\\)
+    P_k\\)
    Args:
        query_size (int): the length of query
@@ -1126,13 +1125,14 @@ class TFDebertaV2PreTrainedModel(TFPreTrainedModel):
 DEBERTA_START_DOCSTRING = r"""
-    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
-    BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
-    it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
-    and behavior.
+    behavior.
    <Tip>
@@ -1141,11 +1141,11 @@ DEBERTA_START_DOCSTRING = r"""
    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.
-    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
-    the tensors in the first argument of the model call function: `model(inputs)`.
+    tensors in the first argument of the model call function: `model(inputs)`.
-    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
-    the first positional argument :
+    first positional argument:
    - a single Tensor with `input_ids` only and nothing else: `model(inputs_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
@@ -1158,18 +1158,16 @@ DEBERTA_START_DOCSTRING = r"""
    Parameters:
        config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DEBERTA_INPUTS_DOCSTRING = r"""
    Args:
-        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`)
+        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`DebertaV2Tokenizer`]. See
+            Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            [`PreTrainedTokenizer.__call__`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
@@ -1180,20 +1178,22 @@ DEBERTA_INPUTS_DOCSTRING = r"""
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
-            than the model's internal embedding lookup matrix.
+            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -1271,7 +1271,7 @@ class TFDebertaV2Model(TFDebertaV2PreTrainedModel):
        return TFBaseModelOutput(last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns)
-@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING)
+@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForMaskedLM with Deberta->DebertaV2
 class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
@@ -1312,8 +1312,9 @@ class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelin
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        inputs = input_processing(
            func=self.call,
@@ -1414,8 +1415,9 @@ class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenc
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        inputs = input_processing(
            func=self.call,
@@ -1606,12 +1608,12 @@ class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsw
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        """
        inputs = input_processing(
            func=self.call,

--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -83,7 +83,9 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
@@ -242,8 +244,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
                Optional second list of IDs for sequence pairs.
        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
@@ -270,7 +271,9 @@ class SPMTokenizer:
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

--- a/src/transformers/models/deit/configuration_deit.py
+++ b/src/transformers/models/deit/configuration_deit.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DeiT model configuration """
+""" DeiT model configuration"""
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -28,14 +28,14 @@ DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class DeiTConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to
+    This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to instantiate a DeiT
-    instantiate an DeiT model according to the specified arguments, defining the model architecture. Instantiating a
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    configuration with the defaults will yield a similar configuration to that of the DeiT
+    defaults will yield a similar configuration to that of the DeiT
    [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
    architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    outputs. Read the documentation from [`PretrainedConfig`] for more information.
+    documentation from [`PretrainedConfig`] for more information.
    Args:
@@ -48,8 +48,8 @@ class DeiTConfig(PretrainedConfig):
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):

--- a/src/transformers/models/deit/feature_extraction_deit.py
+++ b/src/transformers/models/deit/feature_extraction_deit.py
@@ -38,26 +38,25 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    r"""
    Constructs a DeiT feature extractor.
-    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
-    methods. Users should refer to this superclass for more information regarding those methods.
+    should refer to this superclass for more information regarding those methods.
    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to a certain `size`.
        size (`int` or `Tuple(int)`, *optional*, defaults to 256):
            Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
-            is set to `True`.
+            set to `True`.
        resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
-            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect
-            Only has an effect if `do_resize` is set to `True`.
+            if `do_resize` is set to `True`.
        do_center_crop (`bool`, *optional*, defaults to `True`):
-            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
+            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
-            the image is padded with 0's and then center cropped.
+            image is padded with 0's and then center cropped.
        crop_size (`int`, *optional*, defaults to 224):
-            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
-            `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with `image_mean` and `image_std`.
        image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`):

--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch DeiT model. """
+""" PyTorch DeiT model."""
 import collections.abc
@@ -410,15 +410,14 @@ class DeiTPreTrainedModel(PreTrainedModel):
 DEIT_START_DOCSTRING = r"""
-    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
-    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.
    Parameters:
        config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DEIT_INPUTS_DOCSTRING = r"""
@@ -592,8 +591,9 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Returns:
@@ -671,12 +671,13 @@ class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
-            each layer plus the initial embedding outputs.
+            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            weighted average in the self-attention heads.
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
    """
    logits: torch.FloatTensor = None

--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DETR model configuration """
+""" DETR model configuration"""
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -28,18 +28,19 @@ DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class DetrConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`DetrModel`]. It is used to
+    This is the configuration class to store the configuration of a [`DetrModel`]. It is used to instantiate a DETR
-    instantiate a DETR model according to the specified arguments, defining the model architecture. Instantiating a
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    configuration with the defaults will yield a similar configuration to that of the DETR [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
+    defaults will yield a similar configuration to that of the DETR
+    [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    outputs. Read the documentation from [`PretrainedConfig`] for more information.
+    documentation from [`PretrainedConfig`] for more information.
    Args:
        num_queries (`int`, *optional*, defaults to 100):
-            Number of object queries, i.e. detection slots. This is the maximal number of objects
+            Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetrModel`] can
-            [`DetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
+            detect in a single image. For COCO, we recommend 100 queries.
        d_model (`int`, *optional*, defaults to 256):
            Dimension of the layers.
        encoder_layers (`int`, *optional*, defaults to 6):
@@ -55,8 +56,8 @@ class DetrConfig(PretrainedConfig):
        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -68,19 +69,19 @@ class DetrConfig(PretrainedConfig):
        init_xavier_std (`float`, *optional*, defaults to 1):
            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the [LayerDrop paper](see
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
-            https://arxiv.org/abs/1909.11556) for more details.
+            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the [LayerDrop paper](see
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
-            https://arxiv.org/abs/1909.11556) for more details.
+            for more details.
        auxiliary_loss (`bool`, *optional*, defaults to `False`):
            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
        position_embedding_type (`str`, *optional*, defaults to `"sine"`):
-            Type of position embeddings to be used on top of the image features. One of `"sine"` or
+            Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
-            `"learned"`.
        backbone (`str`, *optional*, defaults to `"resnet50"`):
            Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a
-            list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
+            list of all available models, see [this
+            page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
        dilation (`bool`, *optional*, defaults to `False`):
            Whether to replace stride with dilation in the last convolutional block (DC5).
        class_cost (`float`, *optional*, defaults to 1):

--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -124,8 +124,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    r"""
    Constructs a DETR feature extractor.
-    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
-    methods. Users should refer to this superclass for more information regarding those methods.
+    should refer to this superclass for more information regarding those methods.
    Args:
@@ -134,13 +134,13 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to a certain `size`.
        size (`int`, *optional*, defaults to 800):
-            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size
+            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
-            is a sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller
+            sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
-            edge of the image will be matched to this number. i.e, if `height > width`, then image will be
+            the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
-            rescaled to `(size * height / width, size)`.
+            height / width, size)`.
        max_size (`int`, *optional*, defaults to `1333`):
-            The largest size an image dimension can have (otherwise it's capped). Only has an effect if
+            The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is
-            `do_resize` is set to `True`.
+            set to `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with mean and standard deviation.
        image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
@@ -432,15 +432,17 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
            annotations (`Dict`, `List[Dict]`, *optional*):
                The corresponding annotations in COCO format.
-                In case [`DetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int,
+                In case [`DetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for
-                'annotations': [annotation]}, with the annotations being a list of COCO object annotations.
+                each image should have the following format: {'image_id': int, 'annotations': [annotation]}, with the
+                annotations being a list of COCO object annotations.
-                In case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int,
+                In case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for
-                'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic
+                each image should have the following format: {'image_id': int, 'file_name': str, 'segments_info':
-                annotations.
+                [segment_info]} with segments_info being a list of COCO panoptic annotations.
            return_segmentation_masks (`bool`, *optional*, defaults to `False`):
-                Whether to also include instance segmentation masks as part of the labels in case `format = "coco_detection"`.
+                Whether to also include instance segmentation masks as part of the labels in case `format =
+                "coco_detection"`.
            masks_path (`pathlib.Path`, *optional*):
                Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
@@ -455,8 +457,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
                - 0 for pixels that are padding (i.e. **masked**).
            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch
+                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
-                `torch.Tensor` objects.
+                objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -638,8 +640,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
            pixel_values_list (`List[torch.Tensor]`):
                List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W).
            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch
+                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
-                `torch.Tensor` objects.
+                objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -674,8 +676,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
    def post_process(self, outputs, target_sizes):
        """
-        Converts the output of [`DetrForObjectDetection`] into the format expected by the COCO api.
+        Converts the output of [`DetrForObjectDetection`] into the format expected by the COCO api. Only supports
-        Only supports PyTorch.
+        PyTorch.
        Args:
            outputs ([`DetrObjectDetectionOutput`]):
@@ -686,8 +688,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
                augment, but before padding.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
-            image in the batch as predicted by the model.
+            in the batch as predicted by the model.
        """
        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
@@ -712,8 +714,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
        """
-        Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only
+        Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
-        supports PyTorch.
        Parameters:
            outputs ([`DetrSegmentationOutput`]):
@@ -726,8 +727,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
                Threshold to use when turning the predicted masks into binary values.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
-            image in the batch as predicted by the model.
+            in the batch as predicted by the model.
        """
        out_logits, raw_masks = outputs.logits, outputs.pred_masks
        preds = []
@@ -755,13 +756,13 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
    def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
        """
-        Converts the output of [`DetrForSegmentation`] into actual instance segmentation
+        Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports
-        predictions. Only supports PyTorch.
+        PyTorch.
        Args:
            results (`List[Dict]`):
-                Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks"
+                Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks" results will be
-                results will be added.
+                added.
            outputs ([`DetrSegmentationOutput`]):
                Raw outputs of the model.
            orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
@@ -774,8 +775,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
                Threshold to use when turning the predicted masks into binary values.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
-            for an image in the batch as predicted by the model.
+            image in the batch as predicted by the model.
        """
        if len(orig_target_sizes) != len(max_target_sizes):
@@ -799,8 +800,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
        """
-        Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only
+        Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
-        supports PyTorch.
        Parameters:
            outputs ([`DetrSegmentationOutput`]):
@@ -818,8 +818,8 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
                Threshold to use to filter out queries.
        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values
+            `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
-            for an image in the batch as predicted by the model.
+            an image in the batch as predicted by the model.
        """
        if target_sizes is None:
            target_sizes = processed_sizes

--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch DETR model. """
+""" PyTorch DETR model."""
 import math
@@ -71,15 +71,17 @@ class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
-            each layer plus the initial embedding outputs.
+            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            weighted average in the self-attention heads.
+            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            attention softmax, used to compute the weighted average in the cross-attention heads.
+            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
+            used to compute the weighted average in the cross-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
@@ -99,24 +101,27 @@ class DetrModelOutput(Seq2SeqModelOutput):
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
-            each layer plus the initial embedding outputs.
+            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            compute the weighted average in the self-attention heads.
+            sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            attention softmax, used to compute the weighted average in the cross-attention heads.
+            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
+            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
-            each layer plus the initial embedding outputs.
+            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            compute the weighted average in the self-attention heads.
+            sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
@@ -142,33 +147,36 @@ class DetrObjectDetectionOutput(ModelOutput):
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-            possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the
+            possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding
-            unnormalized bounding boxes.
+            boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to *True*)
-            *True*) and labels are provided. It is a list of dictionaries containing the two above keys (`logits`
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
-            and `pred_boxes`) for each decoder layer.
+            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
-            each layer plus the initial embedding outputs.
+            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            compute the weighted average in the self-attention heads.
+            sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            attention softmax, used to compute the weighted average in the cross-attention heads.
+            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
+            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
-            each layer plus the initial embedding outputs.
+            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            compute the weighted average in the self-attention heads.
+            sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
    """
    loss: Optional[torch.FloatTensor] = None
@@ -202,38 +210,40 @@ class DetrSegmentationOutput(ModelOutput):
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-            possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the
+            possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding
-            unnormalized bounding boxes.
+            boxes.
        pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
-            Segmentation masks logits for all queries. See also
+            Segmentation masks logits for all queries. See also [`~DetrFeatureExtractor.post_process_segmentation`] or
-            [`~DetrFeatureExtractor.post_process_segmentation`] or
+            [`~DetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks
-            [`~DetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic
+            respectively.
-            segmentation masks respectively.
        auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to *True*)
-            *True*) and labels are provided. It is a list of dictionaries containing the two above keys (`logits`
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
-            and `pred_boxes`) for each decoder layer.
+            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
-            each layer plus the initial embedding outputs.
+            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            compute the weighted average in the self-attention heads.
+            sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            attention softmax, used to compute the weighted average in the cross-attention heads.
+            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
+            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
-            each layer plus the initial embedding outputs.
+            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            compute the weighted average in the self-attention heads.
+            sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
    """
    loss: Optional[torch.FloatTensor] = None
@@ -692,11 +702,14 @@ class DetrDecoderLayer(nn.Module):
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            position_embeddings (`torch.FloatTensor`, *optional*): position embeddings that are added to the queries and keys
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                position embeddings that are added to the queries and keys
                in the cross-attention layer.
-            query_position_embeddings (`torch.FloatTensor`, *optional*): position embeddings that are added to the queries and keys
+            query_position_embeddings (`torch.FloatTensor`, *optional*):
+                position embeddings that are added to the queries and keys
                in the self-attention layer.
-            encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
@@ -804,13 +817,13 @@ class DetrPreTrainedModel(PreTrainedModel):
 DETR_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    pruning heads etc.)
+    etc.)
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    general usage and behavior.
+    and behavior.
    Parameters:
        config ([`DetrConfig`]):
@@ -824,8 +837,8 @@ DETR_INPUTS_DOCSTRING = r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it.
-            Pixel values can be obtained using [`DetrFeatureExtractor`]. See
+            Pixel values can be obtained using [`DetrFeatureExtractor`]. See [`DetrFeatureExtractor.__call__`] for
-            [`DetrFeatureExtractor.__call__`] for details.
+            details.
        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -838,10 +851,9 @@ DETR_INPUTS_DOCSTRING = r"""
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*):
            Not used by default. Can be used to mask object queries.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
-            `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
-            *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-            cross-attention of the decoder.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image.
@@ -1032,8 +1044,8 @@ class DetrDecoder(DetrPreTrainedModel):
            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Position embeddings that are added to the queries and keys in each cross-attention layer.
-            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):, *optional*):
+            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
-                Position embeddings that are added to the queries and keys in each self-attention layer.
+                Position embeddings that are added to the queries and keys in each self-attention layer.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
@@ -1357,7 +1369,8 @@ class DetrForObjectDetection(DetrPreTrainedModel):
        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
-            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
        Returns:
@@ -2072,7 +2085,8 @@ class DetrHungarianMatcher(nn.Module):
        Params:
            class_cost: This is the relative weight of the classification error in the matching cost
-            bbox_cost: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
+            bbox_cost:
+                This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            giou_cost: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()

--- a/src/transformers/models/distilbert/configuration_distilbert.py
+++ b/src/transformers/models/distilbert/configuration_distilbert.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DistilBERT model configuration """
+""" DistilBERT model configuration"""
 from collections import OrderedDict
 from typing import Mapping
@@ -36,19 +36,18 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class DistilBertConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`DistilBertModel`] or a
+    This is the configuration class to store the configuration of a [`DistilBertModel`] or a [`TFDistilBertModel`]. It
-    [`TFDistilBertModel`]. It is used to instantiate a DistilBERT model according to the specified
+    is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture.
-    arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT
-    configuration to that of the DistilBERT [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
+    [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    outputs. Read the documentation from [`PretrainedConfig`] for more information.
+    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by
-            the `inputs_ids` passed when calling [`DistilBertModel`] or
+            the `inputs_ids` passed when calling [`DistilBertModel`] or [`TFDistilBertModel`].
-            [`TFDistilBertModel`].
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
@@ -67,13 +66,12 @@ class DistilBertConfig(PretrainedConfig):
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qa_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probabilities used in the question answering model
+            The dropout probabilities used in the question answering model [`DistilBertForQuestionAnswering`].
-            [`DistilBertForQuestionAnswering`].
        seq_classif_dropout (`float`, *optional*, defaults to 0.2):
            The dropout probabilities used in the sequence classification and the multiple choice model
            [`DistilBertForSequenceClassification`].

--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -377,19 +377,18 @@ class DistilBertPreTrainedModel(PreTrainedModel):
 DISTILBERT_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    pruning heads etc.)
+    etc.)
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    general usage and behavior.
+    and behavior.
    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -397,9 +396,8 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`DistilBertTokenizer`]. See
+            Indices can be obtained using [`DistilBertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            [`PreTrainedTokenizer.__call__`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
@@ -416,9 +414,9 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
            - 0 indicates the head is **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            This is useful if you want more control over how to convert `input_ids` indices into associated
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            vectors than the model's internal embedding lookup matrix.
+            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -559,7 +557,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
 @add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """,
+    """DistilBert Model with a `masked language modeling` head on top.""",
    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertForMaskedLM(DistilBertPreTrainedModel):
@@ -622,8 +620,9 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -719,8 +718,9 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -836,12 +836,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1052,7 +1052,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Returns:

--- a/src/transformers/models/distilbert/modeling_flax_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py
@@ -47,12 +47,12 @@ _TOKENIZER_FOR_DOC = "DistilBertTokenizer"
 FLAX_DISTILBERT_START_DOCSTRING = r"""
-    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
-    generic methods the library implements for all its model (such as downloading, saving and converting weights from
+    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
-    PyTorch models)
-    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. Use it as a regular Flax linen Module
+    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
-    and refer to the Flax documentation for all matter related to general usage and behavior.
+    subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+    general usage and behavior.
    Finally, this model supports inherent JAX features such as:
@@ -64,8 +64,7 @@ FLAX_DISTILBERT_START_DOCSTRING = r"""
    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -73,9 +72,8 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`BertTokenizer`]. See
+            Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            [`PreTrainedTokenizer.__call__`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
@@ -598,7 +596,7 @@ class FlaxDistilBertForMaskedLMModule(nn.Module):
        )
-@add_start_docstrings("""DistilBert Model with a `language modeling` head on top. """, FLAX_DISTILBERT_START_DOCSTRING)
+@add_start_docstrings("""DistilBert Model with a `language modeling` head on top.""", FLAX_DISTILBERT_START_DOCSTRING)
 class FlaxDistilBertForMaskedLM(FlaxDistilBertPreTrainedModel):
    module_class = FlaxDistilBertForMaskedLMModule

--- a/src/transformers/models/distilbert/modeling_tf_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py
@@ -451,13 +451,13 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel):
 DISTILBERT_START_DOCSTRING = r"""
-    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
-    generic methods the library implements for all its model (such as downloading or saving, resizing the input
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    embeddings, pruning heads etc.)
+    etc.)
-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
-    it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
-    and behavior.
+    behavior.
    <Tip>
@@ -466,11 +466,11 @@ DISTILBERT_START_DOCSTRING = r"""
    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.
-    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
-    the tensors in the first argument of the model call function: `model(inputs)`.
+    tensors in the first argument of the model call function: `model(inputs)`.
-    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
-    the first positional argument :
+    first positional argument :
    - a single Tensor with `input_ids` only and nothing else: `model(inputs_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
@@ -483,8 +483,7 @@ DISTILBERT_START_DOCSTRING = r"""
    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -492,9 +491,8 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-            Indices can be obtained using [`DistilBertTokenizer`]. See
+            Indices can be obtained using [`DistilBertTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
-            [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
+            [`PreTrainedTokenizer.encode`] for details.
-            details.
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
@@ -511,9 +509,9 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
            - 0 indicates the head is **masked**.
        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            This is useful if you want more control over how to convert `input_ids` indices into associated
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            vectors than the model's internal embedding lookup matrix.
+            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
@@ -523,8 +521,8 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
-            argument can be used in eager mode, in graph mode the value will always be set to True.
+            in eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
@@ -632,7 +630,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
 @add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """,
+    """DistilBert Model with a `masked language modeling` head on top.""",
    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -677,8 +675,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
    ):
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        inputs = input_processing(
            func=self.call,
@@ -777,8 +776,9 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
    ):
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        inputs = input_processing(
            func=self.call,
@@ -978,8 +978,8 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
    ):
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]`
-            `input_ids` above)
+            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """
        inputs = input_processing(
            func=self.call,
@@ -1105,12 +1105,12 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            sequence are not taken into account for computing the loss.
+            are not taken into account for computing the loss.
        """
        inputs = input_processing(
            func=self.call,

--- a/src/transformers/models/distilbert/tokenization_distilbert.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert.py
@@ -57,11 +57,10 @@ class DistilBertTokenizer(BertTokenizer):
    r"""
    Construct a DistilBERT tokenizer.
-    [`DistilBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
+    [`DistilBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting
-    tokenization: punctuation splitting and wordpiece.
+    and wordpiece.
-    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
-    parameters.
    """
    vocab_files_names = VOCAB_FILES_NAMES

--- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -66,11 +66,10 @@ class DistilBertTokenizerFast(BertTokenizerFast):
    r"""
    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library).
-    [`DistilBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
+    [`DistilBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation
-    end-to-end tokenization: punctuation splitting and wordpiece.
+    splitting and wordpiece.
-    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
-    parameters.
    """
    vocab_files_names = VOCAB_FILES_NAMES