Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers * Processors and fixes * Last modeling files and fixes to models * Pipeline modules * Utils files * Data submodule * All the other files * Style * Missing examples * Style again * Fix copies * Say bye bye to rst docstrings forever

Mass conversion of documentation from rst to Markdown (#14866)
* Convert docstrings of all configurations and tokenizers * Processors and fixes * Last modeling files and fixes to models * Pipeline modules * Utils files * Data submodule * All the other files * Style * Missing examples * Style again * Fix copies * Say bye bye to rst docstrings forever
27b3031d · Sylvain Gugger · GitHub · 18587639 · 27b3031d · 27b3031d
Unverified Commit 27b3031d authored Dec 21, 2021 by Sylvain Gugger Committed by GitHub Dec 21, 2021
20 changed files
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -1172,8 +1172,9 @@ class TFT5Model(TFT5PreTrainedModel):
        r"""
        Returns:

-        Examples::
+        Examples:

+        ```python
        >>> from transformers import T5Tokenizer, TFT5Model

        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
@@ -1185,8 +1186,7 @@ class TFT5Model(TFT5PreTrainedModel):
        >>> # forward pass
        >>> outputs = model(input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
-
-        """
+        ```"""
        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            warnings.warn(_HEAD_MASK_WARNING_MSG, FutureWarning)
@@ -1627,8 +1627,9 @@ class TFT5EncoderModel(TFT5PreTrainedModel):
        r"""
        Returns:

-        Examples::
+        Examples:

+        ```python
        >>> from transformers import T5Tokenizer, TFT5EncoderModel

        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
@@ -1636,8 +1637,7 @@ class TFT5EncoderModel(TFT5PreTrainedModel):

        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
        >>> outputs = model(input_ids)
-
-        """
+        ```"""
        inputs = input_processing(
            func=self.call,
            config=self.config,

--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -52,53 +52,54 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

 class T5Tokenizer(PreTrainedTokenizer):
    """
-    Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

-            .. note::
+            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
-        extra_ids (:obj:`int`, `optional`, defaults to 100):
+        extra_ids (`int`, *optional*, defaults to 100):
            Number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
-            like in T5 preprocessing see `here
-            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            like in T5 preprocessing see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:

-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
@@ -161,18 +162,18 @@ class T5Tokenizer(PreTrainedTokenizer):
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.

        Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
@@ -202,13 +203,13 @@ class T5Tokenizer(PreTrainedTokenizer):
        use of token type ids, therefore a list of zeros is returned.

        Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

@@ -223,17 +224,17 @@ class T5Tokenizer(PreTrainedTokenizer):
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

-        - single sequence: ``X </s>``
-        - pair of sequences: ``A </s> B </s>``
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`

        Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:

--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -62,35 +62,36 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

 class T5TokenizerFast(PreTrainedTokenizerFast):
    """
-    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
+    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

-            .. note::
+            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
-        extra_ids (:obj:`int`, `optional`, defaults to 100):
+        extra_ids (`int`, *optional*, defaults to 100):
            Number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
-            like in T5 preprocessing see `here
-            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            like in T5 preprocessing see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """

@@ -167,17 +168,17 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

-        - single sequence: ``X </s>``
-        - pair of sequences: ``A </s> B </s>``
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`

        Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
@@ -194,13 +195,13 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
        use of token type ids, therefore a list of zeros is returned.

        Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]


--- a/src/transformers/models/tapas/configuration_tapas.py
+++ b/src/transformers/models/tapas/configuration_tapas.py
@@ -36,101 +36,102 @@ TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class TapasConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.TapasModel`. It is used to
+    This is the configuration class to store the configuration of a [`TapasModel`]. It is used to
    instantiate a TAPAS model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the TAPAS `tapas-base-finetuned-sqa`
-    architecture. Configuration objects inherit from :class:`~transformers.PreTrainedConfig` and can be used to control
-    the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    configuration with the defaults will yield a similar configuration to that of the TAPAS *tapas-base-finetuned-sqa*
+    architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control
+    the model outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Hyperparameters additional to BERT are taken from run_task_main.py and hparam_utils.py of the original
    implementation. Original implementation available at https://github.com/google-research/tapas/tree/master.

    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the TAPAS model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.TapasModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`TapasModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"swish"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[3, 256, 256, 2, 256, 256, 10]`):
-            The vocabulary sizes of the :obj:`token_type_ids` passed when calling :class:`~transformers.TapasModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_sizes (`List[int]`, *optional*, defaults to `[3, 256, 256, 2, 256, 256, 10]`):
+            The vocabulary sizes of the `token_type_ids` passed when calling [`TapasModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
-        positive_label_weight (:obj:`float`, `optional`, defaults to 10.0):
+        positive_label_weight (`float`, *optional*, defaults to 10.0):
            Weight for positive labels.
-        num_aggregation_labels (:obj:`int`, `optional`, defaults to 0):
+        num_aggregation_labels (`int`, *optional*, defaults to 0):
            The number of aggregation operators to predict.
-        aggregation_loss_weight (:obj:`float`, `optional`, defaults to 1.0):
+        aggregation_loss_weight (`float`, *optional*, defaults to 1.0):
            Importance weight for the aggregation loss.
-        use_answer_as_supervision (:obj:`bool`, `optional`):
+        use_answer_as_supervision (`bool`, *optional*):
            Whether to use the answer as the only supervision for aggregation examples.
-        answer_loss_importance (:obj:`float`, `optional`, defaults to 1.0):
+        answer_loss_importance (`float`, *optional*, defaults to 1.0):
            Importance weight for the regression loss.
-        use_normalized_answer_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_normalized_answer_loss (`bool`, *optional*, defaults to `False`):
            Whether to normalize the answer loss by the maximum of the predicted and expected value.
-        huber_loss_delta (:obj:`float`, `optional`):
+        huber_loss_delta (`float`, *optional*):
            Delta parameter used to calculate the regression loss.
-        temperature (:obj:`float`, `optional`, defaults to 1.0):
+        temperature (`float`, *optional*, defaults to 1.0):
            Value used to control (OR change) the skewness of cell logits probabilities.
-        aggregation_temperature (:obj:`float`, `optional`, defaults to 1.0):
+        aggregation_temperature (`float`, *optional*, defaults to 1.0):
            Scales aggregation logits to control the skewness of probabilities.
-        use_gumbel_for_cells (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_gumbel_for_cells (`bool`, *optional*, defaults to `False`):
            Whether to apply Gumbel-Softmax to cell selection.
-        use_gumbel_for_aggregation (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_gumbel_for_aggregation (`bool`, *optional*, defaults to `False`):
            Whether to apply Gumbel-Softmax to aggregation selection.
-        average_approximation_function (:obj:`string`, `optional`, defaults to :obj:`"ratio"`):
-            Method to calculate the expected average of cells in the weak supervision case. One of :obj:`"ratio"`,
-            :obj:`"first_order"` or :obj:`"second_order"`.
-        cell_selection_preference (:obj:`float`, `optional`):
+        average_approximation_function (`str`, *optional*, defaults to `"ratio"`):
+            Method to calculate the expected average of cells in the weak supervision case. One of `"ratio"`,
+            `"first_order"` or `"second_order"`.
+        cell_selection_preference (`float`, *optional*):
            Preference for cell selection in ambiguous cases. Only applicable in case of weak supervision for
            aggregation (WTQ, WikiSQL). If the total mass of the aggregation probabilities (excluding the "NONE"
            operator) is higher than this hyperparameter, then aggregation is predicted for an example.
-        answer_loss_cutoff (:obj:`float`, `optional`):
+        answer_loss_cutoff (`float`, *optional*):
            Ignore examples with answer loss larger than cutoff.
-        max_num_rows (:obj:`int`, `optional`, defaults to 64):
+        max_num_rows (`int`, *optional*, defaults to 64):
            Maximum number of rows.
-        max_num_columns (:obj:`int`, `optional`, defaults to 32):
+        max_num_columns (`int`, *optional*, defaults to 32):
            Maximum number of columns.
-        average_logits_per_cell (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        average_logits_per_cell (`bool`, *optional*, defaults to `False`):
            Whether to average logits per cell.
-        select_one_column (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        select_one_column (`bool`, *optional*, defaults to `True`):
            Whether to constrain the model to only select cells from a single column.
-        allow_empty_column_selection (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        allow_empty_column_selection (`bool`, *optional*, defaults to `False`):
            Whether to allow not to select any column.
-        init_cell_selection_weights_to_zero (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        init_cell_selection_weights_to_zero (`bool`, *optional*, defaults to `False`):
            Whether to initialize cell selection weights to 0 so that the initial probabilities are 50%.
-        reset_position_index_per_cell (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        reset_position_index_per_cell (`bool`, *optional*, defaults to `True`):
            Whether to restart position indexes at every cell (i.e. use relative position embeddings).
-        disable_per_token_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        disable_per_token_loss (`bool`, *optional*, defaults to `False`):
            Whether to disable any (strong or weak) supervision on cells.
-        aggregation_labels (:obj:`Dict[int, label]`, `optional`):
+        aggregation_labels (`Dict[int, label]`, *optional*):
            The aggregation labels used to aggregate the results. For example, the WTQ models have the following
-            aggregation labels: :obj:`{0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}`
-        no_aggregation_label_index (:obj:`int`, `optional`):
+            aggregation labels: `{0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}`
+        no_aggregation_label_index (`int`, *optional*):
            If the aggregation labels are defined and one of these labels represents "No aggregation", this should be
            set to its index. For example, the WTQ models have the "NONE" aggregation label at index 0, so that value
            should be set to 0 for these models.


-    Example::
+    Example:

+    ```python
    >>> from transformers import TapasModel, TapasConfig
    >>> # Initializing a default (SQA) Tapas configuration
    >>> configuration = TapasConfig()
@@ -138,7 +139,7 @@ class TapasConfig(PretrainedConfig):
    >>> model = TapasModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
-    """
+    ```"""

    model_type = "tapas"


--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -912,8 +912,9 @@ class TapasModel(TapasPreTrainedModel):
        r"""
        Returns:

-        Examples::
+        Examples:

+        ```python
        >>> from transformers import TapasTokenizer, TapasModel
        >>> import pandas as pd

@@ -931,7 +932,7 @@ class TapasModel(TapasPreTrainedModel):
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
-        """
+        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

--- a/src/transformers/models/tapas/modeling_tf_tapas.py
+++ b/src/transformers/models/tapas/modeling_tf_tapas.py
@@ -1004,8 +1004,9 @@ class TFTapasModel(TFTapasPreTrainedModel):
        r"""
        Returns:

-        Examples::
+        Examples:

+        ```python
        >>> from transformers import TapasTokenizer, TapasModel
        >>> import pandas as pd

@@ -1023,7 +1024,7 @@ class TFTapasModel(TFTapasPreTrainedModel):
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
-        """
+        ```"""
        inputs = input_processing(
            func=self.call,
            config=self.config,

--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -89,7 +89,7 @@ PRETRAINED_INIT_CONFIGURATION = {name: {"do_lower_case": True} for name in PRETR

 class TapasTruncationStrategy(ExplicitEnum):
    """
-    Possible values for the ``truncation`` argument in :meth:`~transformers.TapasTokenizer.__call__`. Useful for
+    Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for
    tab-completion in an IDE.
    """

@@ -146,44 +146,44 @@ def whitespace_tokenize(text):


 TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
-            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.TapasTruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
                Activates and controls truncation. Accepts the following values:

-                * :obj:`True` or :obj:`'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+                - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
+                  `max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate row by row, removing rows from the table.
-                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
                  sequence lengths greater than the model maximum admissible input size).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                Controls the maximum length to use by one of the truncation/padding parameters.

-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
 """


@@ -192,11 +192,11 @@ class TapasTokenizer(PreTrainedTokenizer):
    Construct a TAPAS tokenizer. Based on WordPiece. Flattens a table and one or more related sentences to be used by
    TAPAS models.

-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.
-    :class:`~transformers.TapasTokenizer` creates several token type ids to encode tabular structure. To be more
-    precise, it adds 7 token type ids, in the following order: :obj:`segment_ids`, :obj:`column_ids`, :obj:`row_ids`,
-    :obj:`prev_labels`, :obj:`column_ranks`, :obj:`inv_column_ranks` and :obj:`numeric_relations`:
+    [`TapasTokenizer`] creates several token type ids to encode tabular structure. To be more
+    precise, it adds 7 token type ids, in the following order: `segment_ids`, `column_ids`, `row_ids`,
+    `prev_labels`, `column_ranks`, `inv_column_ranks` and `numeric_relations`:

    - segment_ids: indicate whether a token belongs to the question (0) or the table (1). 0 for special tokens and
      padding.
@@ -215,56 +215,56 @@ class TapasTokenizer(PreTrainedTokenizer):
    - numeric_relations: indicate numeric relations between the question and the tokens of the table. 0 for all
      question tokens, special tokens and padding.

-    :class:`~transformers.TapasTokenizer` runs end-to-end tokenization on a table and associated sentences: punctuation
+    [`TapasTokenizer`] runs end-to-end tokenization on a table and associated sentences: punctuation
    splitting and wordpiece.

    Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
            File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
-        empty_token (:obj:`str`, `optional`, defaults to :obj:`"[EMPTY]"`):
+        empty_token (`str`, *optional*, defaults to `"[EMPTY]"`):
            The token used for empty cell values in a table. Empty cell values include "", "n/a", "nan" and "?".
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
-            `issue <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
-        cell_trim_length (:obj:`int`, `optional`, defaults to -1):
+            value for `lowercase` (as in the original BERT).
+        cell_trim_length (`int`, *optional*, defaults to -1):
            If > 0: Trim cells so that the length is <= this value. Also disables further cell trimming, should thus be
-            used with :obj:`truncation` set to :obj:`True`.
-        max_column_id (:obj:`int`, `optional`):
+            used with `truncation` set to `True`.
+        max_column_id (`int`, *optional*):
            Max column id to extract.
-        max_row_id (:obj:`int`, `optional`):
+        max_row_id (`int`, *optional*):
            Max row id to extract.
-        strip_column_names (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        strip_column_names (`bool`, *optional*, defaults to `False`):
            Whether to add empty strings instead of column names.
-        update_answer_coordinates (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        update_answer_coordinates (`bool`, *optional*, defaults to `False`):
            Whether to recompute the answer coordinates from the answer text.
-        min_question_length (:obj:`int`, `optional`):
+        min_question_length (`int`, *optional*):
            Minimum length of each question in terms of tokens (will be skipped otherwise).
-        max_question_length (:obj:`int`, `optional`):
+        max_question_length (`int`, *optional*):
            Maximum length of each question in terms of tokens (will be skipped otherwise).
    """

@@ -421,12 +421,12 @@ class TapasTokenizer(PreTrainedTokenizer):
        Creates the attention mask according to the query token IDs and a list of table values.

        Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                token value, the column ID and the row ID of said token.

        Returns:
-            :obj:`List[int]`: List of ints containing the attention mask values.
+            `List[int]`: List of ints containing the attention mask values.
        """
        return [1] * (1 + len(query_ids) + 1 + len(table_values))

@@ -437,12 +437,12 @@ class TapasTokenizer(PreTrainedTokenizer):
        Creates the segment token type IDs according to the query token IDs and a list of table values.

        Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                token value, the column ID and the row ID of said token.

        Returns:
-            :obj:`List[int]`: List of ints containing the segment token type IDs values.
+            `List[int]`: List of ints containing the segment token type IDs values.
        """
        table_ids = list(zip(*table_values))[0] if table_values else []
        return [0] * (1 + len(query_ids) + 1) + [1] * len(table_ids)
@@ -454,12 +454,12 @@ class TapasTokenizer(PreTrainedTokenizer):
        Creates the column token type IDs according to the query token IDs and a list of table values.

        Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                token value, the column ID and the row ID of said token.

        Returns:
-            :obj:`List[int]`: List of ints containing the column token type IDs values.
+            `List[int]`: List of ints containing the column token type IDs values.
        """
        table_column_ids = list(zip(*table_values))[1] if table_values else []
        return [0] * (1 + len(query_ids) + 1) + list(table_column_ids)
@@ -471,12 +471,12 @@ class TapasTokenizer(PreTrainedTokenizer):
        Creates the row token type IDs according to the query token IDs and a list of table values.

        Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                token value, the column ID and the row ID of said token.

        Returns:
-            :obj:`List[int]`: List of ints containing the row token type IDs values.
+            `List[int]`: List of ints containing the row token type IDs values.
        """
        table_row_ids = list(zip(*table_values))[2] if table_values else []
        return [0] * (1 + len(query_ids) + 1) + list(table_row_ids)
@@ -489,11 +489,11 @@ class TapasTokenizer(PreTrainedTokenizer):
        by concatenating and adding special tokens.

        Args:
-            token_ids_0 (:obj:`List[int]`): The ids of the question.
-            token_ids_1 (:obj:`List[int]`, `optional`): The ids of the flattened table.
+            token_ids_0 (`List[int]`): The ids of the question.
+            token_ids_1 (`List[int]`, *optional*): The ids of the flattened table.

        Returns:
-            :obj:`List[int]`: The model input with special tokens.
+            `List[int]`: The model input with special tokens.
        """
        if token_ids_1 is None:
            raise ValueError("With TAPAS, you must provide both question IDs and table IDs.")
@@ -505,18 +505,18 @@ class TapasTokenizer(PreTrainedTokenizer):
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.

        Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                List of question IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                List of flattened table IDs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
@@ -563,20 +563,20 @@ class TapasTokenizer(PreTrainedTokenizer):
        Main method to tokenize and prepare for the model one or several sequence(s) related to a table.

        Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
                dataframe to convert it to string.
-            queries (:obj:`str` or :obj:`List[str]`):
+            queries (`str` or `List[str]`):
                Question or batch of questions related to a table to be encoded. Note that in case of a batch, all
                questions must refer to the **same** table.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                Answer coordinates of each table-question pair in the batch. In case only a single table-question pair
                is provided, then the answer_coordinates must be a single list of one or more tuples. Each tuple must
                be a (row_index, column_index) pair. The first data row (not the column header row) has index 0. The
                first column has index 0. In case a batch of table-question pairs is provided, then the
                answer_coordinates must be a list of lists of tuples (each list corresponding to a single
                table-question pair).
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                Answer text of each table-question pair in the batch. In case only a single table-question pair is
                provided, then the answer_text must be a single list of one or more strings. Each string must be the
                answer text of a corresponding answer coordinate. In case a batch of table-question pairs is provided,
@@ -675,22 +675,25 @@ class TapasTokenizer(PreTrainedTokenizer):
        """
        Prepare a table and a list of strings for the model.

-        .. warning::
-            This method is deprecated, ``__call__`` should be used instead.
+        <Tip warning={true}>
+
+        This method is deprecated, `__call__` should be used instead.
+
+        </Tip>

        Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
                dataframe to convert it to string.
-            queries (:obj:`List[str]`):
+            queries (`List[str]`):
                Batch of questions related to a table to be encoded. Note that all questions must refer to the **same**
                table.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                Answer coordinates of each table-question pair in the batch. Each tuple must be a (row_index,
                column_index) pair. The first data row (not the column header row) has index 0. The first column has
                index 0. The answer_coordinates must be a list of lists of tuples (each list corresponding to a single
                table-question pair).
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                Answer text of each table-question pair in the batch. In case a batch of table-question pairs is
                provided, then the answer_coordinates must be a list of lists of strings (each list corresponding to a
                single table-question pair). Each string must be the answer text of a corresponding answer coordinate.
@@ -900,13 +903,13 @@ class TapasTokenizer(PreTrainedTokenizer):
        """
        Prepare a table and a string for the model. This method does not return token type IDs, attention masks, etc.
        which are necessary for the model to work correctly. Use that method if you want to build your processing on
-        your own, otherwise refer to ``__call__``.
+        your own, otherwise refer to `__call__`.

        Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
                dataframe to convert it to string.
-            query (:obj:`str` or :obj:`List[str]`):
+            query (`str` or `List[str]`):
                Question related to a table to be encoded.
        """
        encoded_inputs = self.encode_plus(
@@ -953,16 +956,16 @@ class TapasTokenizer(PreTrainedTokenizer):
        Prepare a table and a string for the model.

        Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
                dataframe to convert it to string.
-            query (:obj:`str` or :obj:`List[str]`):
+            query (`str` or `List[str]`):
                Question related to a table to be encoded.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                Answer coordinates of each table-question pair in the batch. The answer_coordinates must be a single
                list of one or more tuples. Each tuple must be a (row_index, column_index) pair. The first data row
                (not the column header row) has index 0. The first column has index 0.
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                Answer text of each table-question pair in the batch. The answer_text must be a single list of one or
                more strings. Each string must be the answer text of a corresponding answer coordinate.
        """
@@ -1094,19 +1097,19 @@ class TapasTokenizer(PreTrainedTokenizer):
        sequences if overflowing while taking into account the special tokens.

        Args:
-            raw_table (:obj:`pd.DataFrame`):
+            raw_table (`pd.DataFrame`):
                The original table before any transformation (like tokenization) was applied to it.
-            raw_query (:obj:`TextInput` or :obj:`PreTokenizedInput` or :obj:`EncodedInput`):
+            raw_query (`TextInput` or `PreTokenizedInput` or `EncodedInput`):
                The original query before any transformation (like tokenization) was applied to it.
-            tokenized_table (:obj:`TokenizedTable`):
+            tokenized_table (`TokenizedTable`):
                The table after tokenization.
-            query_tokens (:obj:`List[str]`):
+            query_tokens (`List[str]`):
                The query after tokenization.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                Answer coordinates of each table-question pair in the batch. The answer_coordinates must be a single
                list of one or more tuples. Each tuple must be a (row_index, column_index) pair. The first data row
                (not the column header row) has index 0. The first column has index 0.
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                Answer text of each table-question pair in the batch. The answer_text must be a single list of one or
                more strings. Each string must be the answer text of a corresponding answer coordinate.
        """
@@ -1267,22 +1270,22 @@ class TapasTokenizer(PreTrainedTokenizer):
        Truncates a sequence pair in-place following the strategy.

        Args:
-            query_tokens (:obj:`List[str]`):
+            query_tokens (`List[str]`):
                List of strings corresponding to the tokenized query.
-            tokenized_table (:obj:`TokenizedTable`):
+            tokenized_table (`TokenizedTable`):
                Tokenized table
-            num_rows (:obj:`int`):
+            num_rows (`int`):
                Total number of table rows
-            num_columns (:obj:`int`):
+            num_columns (`int`):
                Total number of table columns
-            max_length (:obj:`int`):
+            max_length (`int`):
                Total maximum length.
-            truncation_strategy (:obj:`str` or :class:`~transformers.TapasTruncationStrategy`):
+            truncation_strategy (`str` or [`TapasTruncationStrategy`]):
                Truncation strategy to use. Seeing as this method should only be called when truncating, the only
-                available strategy is the :obj:`"drop_rows_to_fit"` strategy.
+                available strategy is the `"drop_rows_to_fit"` strategy.

        Returns:
-            :obj:`Tuple(int, int)`: tuple containing the number of rows after truncation, and the number of tokens
+            `Tuple(int, int)`: tuple containing the number of rows after truncation, and the number of tokens
            available for each table element.
        """
        if not isinstance(truncation_strategy, TapasTruncationStrategy):
@@ -1319,8 +1322,8 @@ class TapasTokenizer(PreTrainedTokenizer):
        Tokenizes column headers and cell texts of a table.

        Args:
-            table (:obj:`pd.Dataframe`):
-                Table. Returns: :obj:`TokenizedTable`: TokenizedTable object.
+            table (`pd.DataFrame`):
+                Table. Returns: `TokenizedTable`: TokenizedTable object.
        """
        tokenized_rows = []
        tokenized_row = []
@@ -1366,8 +1369,8 @@ class TapasTokenizer(PreTrainedTokenizer):
        sequence length of the model.

        Args:
-            question_tokens (:obj:`List[String]`):
-                List of question tokens. Returns: :obj:`int`: the number of tokens left for the table, given the model
+            question_tokens (`List[str]`):
+                List of question tokens. Returns: `int`: the number of tokens left for the table, given the model
                max length.
        """
        return (max_length if max_length is not None else self.model_max_length) - self._question_encoding_cost(
@@ -1887,33 +1890,32 @@ class TapasTokenizer(PreTrainedTokenizer):

    def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_classification_threshold=0.5):
        """
-        Converts logits of :class:`~transformers.TapasForQuestionAnswering` to actual predicted answer coordinates and
+        Converts logits of [`TapasForQuestionAnswering`] to actual predicted answer coordinates and
        optional aggregation indices.

-        The original implementation, on which this function is based, can be found `here
-        <https://github.com/google-research/tapas/blob/4908213eb4df7aa988573350278b44c4dbe3f71b/tapas/experiments/prediction_utils.py#L288>`__.
+        The original implementation, on which this function is based, can be found [here](https://github.com/google-research/tapas/blob/4908213eb4df7aa988573350278b44c4dbe3f71b/tapas/experiments/prediction_utils.py#L288).

        Args:
-            data (:obj:`dict`):
+            data (`dict`):
                Dictionary mapping features to actual values. Should be created using
-                :class:`~transformers.TapasTokenizer`.
-            logits (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, sequence_length)``):
+                [`TapasTokenizer`].
+            logits (`torch.Tensor` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
                Tensor containing the logits at the token level.
-            logits_agg (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, num_aggregation_labels)``, `optional`):
+            logits_agg (`torch.Tensor` or `tf.Tensor` of shape `(batch_size, num_aggregation_labels)`, *optional*):
                Tensor containing the aggregation logits.
-            cell_classification_threshold (:obj:`float`, `optional`, defaults to 0.5):
+            cell_classification_threshold (`float`, *optional*, defaults to 0.5):
                Threshold to be used for cell selection. All table cells for which their probability is larger than
                this threshold will be selected.

        Returns:
-            :obj:`tuple` comprising various elements depending on the inputs:
+            `tuple` comprising various elements depending on the inputs:

-            - predicted_answer_coordinates (``List[List[[tuple]]`` of length ``batch_size``): Predicted answer
+            - predicted_answer_coordinates (`List[List[tuple]]` of length `batch_size`): Predicted answer
              coordinates as a list of lists of tuples. Each element in the list contains the predicted answer
              coordinates of a single example in the batch, as a list of tuples. Each tuple is a cell, i.e. (row index,
              column index).
-            - predicted_aggregation_indices (``List[int]``of length ``batch_size``, `optional`, returned when
-              ``logits_aggregation`` is provided): Predicted aggregation operator indices of the aggregation head.
+            - predicted_aggregation_indices (`List[int]` of length `batch_size`, *optional*, returned when
+              `logits_agg` is provided): Predicted aggregation operator indices of the aggregation head.
        """
        # converting to numpy arrays to work with PT/TF
        logits = logits.numpy()
@@ -1994,19 +1996,18 @@ class BasicTokenizer(object):
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
    """

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -2023,9 +2024,9 @@ class BasicTokenizer(object):
        WordPieceTokenizer.

        Args:
-            **never_split**: (`optional`) list of str
+            never_split (`List[str]`, *optional*):
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -2152,11 +2153,11 @@ class WordpieceTokenizer(object):
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

-        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
+                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.

--- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/configuration_transfo_xl.py
@@ -28,70 +28,71 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class TransfoXLConfig(PretrainedConfig):
    """
-    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel` or a
-    :class:`~transformers.TFTransfoXLModel`. It is used to instantiate a Transformer-XL model according to the
+    This is the configuration class to store the configuration of a [`TransfoXLModel`] or a
+    [`TFTransfoXLModel`]. It is used to instantiate a Transformer-XL model according to the
    specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a
-    similar configuration to that of the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
+    similar configuration to that of the [Transformer XL](https://huggingface.co/transfo-xl-wt103) architecture.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 267735):
+        vocab_size (`int`, *optional*, defaults to 267735):
            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.TransfoXLModel` or
-            :class:`~transformers.TFTransfoXLModel`.
-        cutoffs (:obj:`List[int]`, `optional`, defaults to :obj:`[20000, 40000, 200000]`):
+            `inputs_ids` passed when calling [`TransfoXLModel`] or
+            [`TFTransfoXLModel`].
+        cutoffs (`List[int]`, *optional*, defaults to `[20000, 40000, 200000]`):
            Cutoffs for the adaptive softmax.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the model's hidden states.
-        d_embed (:obj:`int`, `optional`, defaults to 1024):
+        d_embed (`int`, *optional*, defaults to 1024):
            Dimensionality of the embeddings
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
-        d_head (:obj:`int`, `optional`, defaults to 64):
+        d_head (`int`, *optional*, defaults to 64):
            Dimensionality of the model's heads.
-        d_inner (:obj:`int`, `optional`, defaults to 4096):
+        d_inner (`int`, *optional*, defaults to 4096):
            Inner dimension in FF
-        div_val (:obj:`int`, `optional`, defaults to 4):
+        div_val (`int`, *optional*, defaults to 4):
            Divident value for adapative input and softmax
-        pre_lnorm (:obj:`boolean`, `optional`, defaults to :obj:`False`):
+        pre_lnorm (`boolean`, *optional*, defaults to `False`):
            Whether or not to apply LayerNorm to the input instead of the output in the blocks.
-        n_layer (:obj:`int`, `optional`, defaults to 18):
+        n_layer (`int`, *optional*, defaults to 18):
            Number of hidden layers in the Transformer encoder.
-        mem_len (:obj:`int`, `optional`, defaults to 1600):
+        mem_len (`int`, *optional*, defaults to 1600):
            Length of the retained previous heads.
-        clamp_len (:obj:`int`, `optional`, defaults to 1000):
+        clamp_len (`int`, *optional*, defaults to 1000):
            Use the same pos embeddings after clamp_len.
-        same_length (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        same_length (`boolean`, *optional*, defaults to `True`):
            Whether or not to use the same attn length for all tokens
-        proj_share_all_but_first (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        proj_share_all_but_first (`boolean`, *optional*, defaults to `True`):
            True to share all but first projs, False not to share.
-        attn_type (:obj:`int`, `optional`, defaults to 0):
+        attn_type (`int`, *optional*, defaults to 0):
            Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-        sample_softmax (:obj:`int`, `optional`, defaults to -1):
+        sample_softmax (`int`, *optional*, defaults to -1):
            Number of samples in the sampled softmax.
-        adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        adaptive (`boolean`, *optional*, defaults to `True`):
            Whether or not to use adaptive softmax.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        dropatt (:obj:`float`, `optional`, defaults to 0):
+        dropatt (`float`, *optional*, defaults to 0):
            The dropout ratio for the attention probabilities.
-        untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        untie_r (`boolean`, *optional*, defaults to `True`):
            Whether ot not to untie relative position biases.
-        init (:obj:`str`, `optional`, defaults to :obj:`"normal"`):
+        init (`str`, *optional*, defaults to `"normal"`):
            Parameter initializer to use.
-        init_range (:obj:`float`, `optional`, defaults to 0.01):
+        init_range (`float`, *optional*, defaults to 0.01):
            Parameters initialized by U(-init_range, init_range).
-        proj_init_std (:obj:`float`, `optional`, defaults to 0.01):
+        proj_init_std (`float`, *optional*, defaults to 0.01):
            Parameters initialized by N(0, init_std)
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
            Parameters initialized by N(0, init_std)
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers

-    Examples::
+    Examples:

+    ```python
    >>> from transformers import TransfoXLConfig, TransfoXLModel

    >>> # Initializing a Transformer XL configuration
@@ -102,7 +103,7 @@ class TransfoXLConfig(PretrainedConfig):

    >>> # Accessing the model configuration
    >>> configuration = model.config
-    """
+    ```"""

    model_type = "transfo-xl"
    keys_to_ignore_at_inference = ["mems"]

--- a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
@@ -189,18 +189,18 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):

    def log_prob(self, hidden):
        r"""
-        Computes log probabilities for all :math:`n\_classes` From:
+        Computes log probabilities for all \\(n\_classes\\) From:
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.p

        Args:
            hidden (Tensor): a minibatch of example

        Returns:
-            log-probabilities of for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where
-            :math:`n\_classes` is a parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. Shape:
+            log-probabilities of for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where
+            \\(n\_classes\\) is a parameter passed to `AdaptiveLogSoftmaxWithLoss` constructor. Shape:

-            - Input: :math:`(N, in\_features)`
-            - Output: :math:`(N, n\_classes)`
+            - Input: \\((N, in\_features)\\)
+            - Output: \\((N, n\_classes)\\)
        """
        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])

--- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
@@ -76,10 +76,12 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:
    Returns:
        A list of strings with tokenized numbers.

-    Example::
+    Example:
+
+    ```python
    >>> tokenize_numbers(["$", "5,000", "1.73", "m"])
    ["$", "5", "@,@", "000", "1", "@.@", "73", "m"]
-    """
+    ```"""
    tokenized = []
    for i in range(len(text_array)):
        reg, sub = MATCH_NUMBERS
@@ -91,7 +93,7 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:

 def detokenize_numbers(text: str) -> str:
    """
-    Inverts the operation of `tokenize_numbers`. This is replacing ' @,@ ' and ' @.@' by ',' and '.'.
+    Inverts the operation of `tokenize_numbers`. This is replacing ' @,@ ' and ' @.@' by ',' and '.'.

    Args:
        text: A string where the number should be detokenized.
@@ -99,10 +101,12 @@ def detokenize_numbers(text: str) -> str:
    Returns:
        A detokenized string.

-    Example::
+    Example:
+
+    ```python
    >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
    "$ 5,000 1.73 m"
-    """
+    ```"""
    for reg, sub in DETOKENIZE_NUMBERS:
        text = re.sub(reg, sub, text)
    return text
@@ -110,41 +114,40 @@ def detokenize_numbers(text: str) -> str:

 class TransfoXLTokenizer(PreTrainedTokenizer):
    """
-    Construct a Transformer-XL tokenizer adapted from Vocab class in `the original code
-    <https://github.com/kimiyoung/transformer-xl>`__. The Transformer-XL tokenizer is a word-level tokenizer (no
+    Construct a Transformer-XL tokenizer adapted from Vocab class in [the original code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no
    sub-word tokenization).

-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
-        special (:obj:`List[str]`, `optional`):
+        special (`List[str]`, *optional*):
            A list of special tokens (to be treated by the original implementation of this tokenizer).
-        min_freq (:obj:`int`, `optional`, defaults to 0):
+        min_freq (`int`, *optional*, defaults to 0):
            The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
-            will be mapped to :obj:`unk_token`).
-        max_size (:obj:`int`, `optional`):
+            will be mapped to `unk_token`).
+        max_size (`int`, *optional*):
            The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
-            after excluding the tokens according to the :obj:`min_freq` rule.
-        lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            after excluding the tokens according to the `min_freq` rule.
+        lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
-        delimiter (:obj:`str`, `optional`):
+        delimiter (`str`, *optional*):
            The delimiter used between tokens.
-        vocab_file (:obj:`str`, `optional`):
+        vocab_file (`str`, *optional*):
            File containing the vocabulary (from the original implementation).
-        pretrained_vocab_file (:obj:`str`, `optional`):
-            File containing the vocabulary as saved with the :obj:`save_pretrained()` method.
-        never_split (:obj:`List[str]`, `optional`):
+        pretrained_vocab_file (`str`, *optional*):
+            File containing the vocabulary as saved with the `save_pretrained()` method.
+        never_split (`List[str]`, *optional*):
            List of tokens that should never be split. If no list is specified, will simply use the existing special
            tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"<eos>"`):
+        eos_token (`str`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<formula>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<formula>"]`):
            A list of additional special tokens (for the HuggingFace functionality).
-        language (:obj:`str`, `optional`, defaults to :obj:`"en"`):
+        language (`str`, *optional*, defaults to `"en"`):
            The language of this tokenizer (used for mose preprocessing).
    """

@@ -407,8 +410,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):

    def moses_pipeline(self, text: str) -> List[str]:
        """
-        Does basic tokenization using :class:`sacremoses.MosesPunctNormalizer` and :class:`sacremoses.MosesTokenizer`
-        with `aggressive_dash_splits=True` (see :func:`sacremoses.tokenize.MosesTokenizer.tokenize`). Additionally,
+        Does basic tokenization using [`sacremoses.MosesPunctNormalizer`] and [`sacremoses.MosesTokenizer`]
+        with `aggressive_dash_splits=True` (see [`sacremoses.tokenize.MosesTokenizer.tokenize`]). Additionally,
        large comma-separated numbers and floating point values are split. E.g. "23,000 people are 1.80m tall" -> "23
        @,@ 000 people are 1 @.@ 80m tall"

@@ -418,11 +421,13 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
        Returns:
            A list of tokenized string

-        Example::
+        Example:
+
+        ```python
        >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
        >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
        ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall']
-        """
+        ```"""
        text = self.moses_punct_norm(text)
        text = self.moses_tokenize(text)
        text = tokenize_numbers(text)

--- a/src/transformers/models/trocr/configuration_trocr.py
+++ b/src/transformers/models/trocr/configuration_trocr.py
@@ -28,57 +28,57 @@ TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class TrOCRConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.TrOCRForCausalLM`. It is used
+    This is the configuration class to store the configuration of a [`TrOCRForCausalLM`]. It is used
    to instantiate an TrOCR model according to the specified arguments, defining the model architecture. Instantiating
-    a configuration with the defaults will yield a similar configuration to that of the TrOCR `microsoft/trocr-base
-    <https://huggingface.co/microsoft/trocr-base>`__ architecture.
+    a configuration with the defaults will yield a similar configuration to that of the TrOCR [microsoft/trocr-base](https://huggingface.co/microsoft/trocr-base) architecture.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.


    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the TrOCR model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.TrOCRForCausalLM`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `inputs_ids` passed when calling [`TrOCRForCausalLM`].
+        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
-            The non-linear activation function (function or string) in the pooler. If string, :obj:`"gelu"`,
-            :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether or not to scale the word embeddings by sqrt(d_model).
-        use_learned_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_learned_position_embeddings (`bool`, *optional*, defaults to `True`):
            Whether or not to use learned position embeddings. If not, sinusoidal position embeddings will be used.
-        layernorm_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        layernorm_embedding (`bool`, *optional*, defaults to `True`):
            Whether or not to use a layernorm after the word + position embeddings.

-    Example::
+    Example:

+    ```python
    >>> from transformers import TrOCRForCausalLM, TrOCRConfig

    >>> # Initializing a TrOCR-base style configuration
@@ -89,7 +89,7 @@ class TrOCRConfig(PretrainedConfig):

    >>> # Accessing the model configuration
    >>> configuration = model.config
-    """
+    ```"""
    model_type = "trocr"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {

--- a/src/transformers/models/trocr/processing_trocr.py
+++ b/src/transformers/models/trocr/processing_trocr.py
@@ -28,15 +28,15 @@ class TrOCRProcessor:
    r"""
    Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.

-    :class:`~transformers.TrOCRProcessor` offers all the functionalities of :class:`~transformers.AutoFeatureExtractor`
-    and :class:`~transformers.RobertaTokenizer`. See the :meth:`~transformers.TrOCRProcessor.__call__` and
-    :meth:`~transformers.TrOCRProcessor.decode` for more information.
+    [`TrOCRProcessor`] offers all the functionalities of [`AutoFeatureExtractor`]
+    and [`RobertaTokenizer`]. See the [`~TrOCRProcessor.__call__`] and
+    [`~TrOCRProcessor.decode`] for more information.

    Args:
-        feature_extractor (:class:`~transformers.AutoFeatureExtractor`):
-            An instance of :class:`~transformers.AutoFeatureExtractor`. The feature extractor is a required input.
-        tokenizer (:class:`~transformers.RobertaTokenizer`):
-            An instance of :class:`~transformers.RobertaTokenizer`. The tokenizer is a required input.
+        feature_extractor ([`AutoFeatureExtractor`]):
+            An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`RobertaTokenizer`]):
+            An instance of [`RobertaTokenizer`]. The tokenizer is a required input.
    """

    def __init__(self, feature_extractor, tokenizer):
@@ -55,17 +55,19 @@ class TrOCRProcessor:

    def save_pretrained(self, save_directory):
        """
-        Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory ``save_directory``, so that
-        it can be re-loaded using the :func:`~transformers.TrOCRProcessor.from_pretrained` class method.
+        Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory `save_directory`, so that
+        it can be re-loaded using the [`~TrOCRProcessor.from_pretrained`] class method.

-        .. note::
+        <Tip>

-            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
+        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
        docstrings of the methods above for more information.

+        </Tip>
+
        Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
        """
@@ -76,30 +78,32 @@ class TrOCRProcessor:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
-        Instantiate a :class:`~transformers.TrOCRProcessor` from a pretrained TrOCR processor.
+        Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor.

-        .. note::
+        <Tip>

        This class method is simply calling AutoFeatureExtractor's
-            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and TrOCRTokenizer's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
+        [`~PreTrainedFeatureExtractor.from_pretrained`] and TrOCRTokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
        docstrings of the methods above for more information.

+        </Tip>
+
        Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
            **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
        """
        feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -109,9 +113,9 @@ class TrOCRProcessor:
    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
-        :meth:`~transformers.AutoFeatureExtractor.__call__` and returns its output. If used in the context
-        :meth:`~transformers.TrOCRProcessor.as_target_processor` this method forwards all its arguments to
-        TrOCRTokenizer's :meth:`~transformers.TrOCRTokenizer.__call__`. Please refer to the doctsring of the above two
+        [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to
+        TrOCRTokenizer's [`~TrOCRTokenizer.__call__`]. Please refer to the docstring of the above two
        methods for more information.
        """
        return self.current_processor(*args, **kwargs)
@@ -119,14 +123,14 @@ class TrOCRProcessor:
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to TrOCRTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
-        This method forwards all its arguments to TrOCRTokenizer's :meth:`~transformers.PreTrainedTokenizer.decode`.
+        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

--- a/src/transformers/models/unispeech/configuration_unispeech.py
+++ b/src/transformers/models/unispeech/configuration_unispeech.py
@@ -28,136 +28,133 @@ UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class UniSpeechConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.UniSpeechModel`. It is used
+    This is the configuration class to store the configuration of a [`UniSpeechModel`]. It is used
    to instantiate an UniSpeech model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the UniSpeech
-    `facebook/unispeech-base-960h <https://huggingface.co/facebook/unispeech-base-960h>`__ architecture.
+    [facebook/unispeech-base-960h](https://huggingface.co/facebook/unispeech-base-960h) architecture.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.


    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
            Vocabulary size of the UniSpeech model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.UniSpeechModel`. Vocabulary size of the
-            model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward
-            method of :class:`~transformers.UniSpeechModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `inputs_ids` passed when calling [`UniSpeechModel`]. Vocabulary size of the
+            model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`UniSpeechModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`UniSpeechForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`UniSpeechForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
            convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for quantized feature extractor states.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
            A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
            embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
            Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
            Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
            procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
            Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
            `mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
            masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
            Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
            `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`
-        num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
            Number of entries in each quantization codebook (group).
-        num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
+        num_codevector_groups (`int`, *optional*, defaults to 2):
            Number of codevector groups for product codevector quantization.
-        contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
-            The temperature `kappa` in the contrastive loss.
-        feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the output of the feature extractor that's used by the quantizer.
-        num_negatives (:obj:`int`, `optional`, defaults to 100):
+        num_negatives (`int`, *optional*, defaults to 100):
            Number of negative samples for the contrastive loss.
-        codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        codevector_dim (`int`, *optional*, defaults to 256):
            Dimensionality of the quantized feature vectors.
-        proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
            Dimensionality of the final projection of both the quantized and the transformer features.
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
            The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`UniSpeechForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
            mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`UniSpeechForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.UniSpeechForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`UniSpeechForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
            Dimensionality of the projection before token mean-pooling for classification.
-        replace_prob (:obj:`float`, `optional`, defaults to 0.5):
+        replace_prob (`float`, *optional*, defaults to 0.5):
            Probability that transformer feature is replaced by quantized feature for pretraining.

-    Example::
+    Example:

+    ```python
    >>> from transformers import UniSpeechModel, UniSpeechConfig

    >>> # Initializing a UniSpeech facebook/unispeech-base-960h style configuration
@@ -168,7 +165,7 @@ class UniSpeechConfig(PretrainedConfig):

    >>> # Accessing the model configuration
    >>> configuration = model.config
-    """
+    ```"""
    model_type = "unispeech"

    def __init__(

--- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
@@ -28,145 +28,142 @@ UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class UniSpeechSatConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.UniSpeechSatModel`. It is
+    This is the configuration class to store the configuration of a [`UniSpeechSatModel`]. It is
    used to instantiate an UniSpeechSat model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the UniSpeechSat
-    `facebook/unispeech_sat-base-960h <https://huggingface.co/facebook/unispeech_sat-base-960h>`__ architecture.
+    [facebook/unispeech_sat-base-960h](https://huggingface.co/facebook/unispeech_sat-base-960h) architecture.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.


    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
            Vocabulary size of the UniSpeechSat model. Defines the number of different tokens that can be represented
-            by the :obj:`inputs_ids` passed when calling :class:`~transformers.UniSpeechSatModel`. Vocabulary size of
-            the model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward
-            method of :class:`~transformers.UniSpeechSatModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            by the `inputs_ids` passed when calling [`UniSpeechSatModel`]. Vocabulary size of
+            the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`UniSpeechSatModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`UniSpeechSatForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
            convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for quantized feature extractor states.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
            A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
            embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
            Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
            Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
            procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
            Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
            `mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
            masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
            Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
            `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`
-        num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
            Number of entries in each quantization codebook (group).
-        num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
+        num_codevector_groups (`int`, *optional*, defaults to 2):
            Number of codevector groups for product codevector quantization.
-        contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
-            The temperature `kappa` in the contrastive loss.
-        feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the output of the feature extractor that's used by the quantizer.
-        num_negatives (:obj:`int`, `optional`, defaults to 100):
+        num_negatives (`int`, *optional*, defaults to 100):
            Number of negative samples for the contrastive loss.
-        codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        codevector_dim (`int`, *optional*, defaults to 256):
            Dimensionality of the quantized feature vectors.
-        proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
            Dimensionality of the final projection of both the quantized and the transformer features.
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
            The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechSatForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`UniSpeechSatForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
            mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechSatForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`UniSpeechSatForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.UniSpeechSatForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`UniSpeechSatForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
            Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 1500)`):
-            A tuple of integers defining the number of output channels of each 1D convolutional layer in the `TDNN`
-            module of the `XVector` model. The length of `tdnn_dim` defines the number of `TDNN` layers.
-        tdnn_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 3, 3, 1, 1)`):
-            A tuple of integers defining the kernel size of each 1D convolutional layer in the `TDNN` module of the
-            `XVector` model. The length of `tdnn_kernel` has to match the length of `tdnn_dim`.
-        tdnn_dilation (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(1, 2, 3, 1, 1)`):
-            A tuple of integers defining the dilation factor of each 1D convolutional layer in `TDNN` module of the
-            `XVector` model. The length of `tdnn_dilation` has to match the length of `tdnn_dim`.
-        xvector_output_dim (:obj:`int`, `optional`, defaults to 512):
-            Dimensionality of the `XVector` embedding vectors.
+        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.

-    Example::
+    Example:

+    ```python
    >>> from transformers import UniSpeechSatModel, UniSpeechSatConfig

    >>> # Initializing a UniSpeechSat facebook/unispeech_sat-base-960h style configuration
@@ -177,7 +174,7 @@ class UniSpeechSatConfig(PretrainedConfig):

    >>> # Accessing the model configuration
    >>> configuration = model.config
-    """
+    ```"""
    model_type = "unispeech-sat"

    def __init__(

--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -1266,8 +1266,9 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
        r"""
        Returns:

-        Example::
+        Example:

+        ```python
        >>> import torch
        >>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining
        >>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
@@ -1308,7 +1309,7 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
        >>> # for contrastive loss training model should be put into train mode
        >>> model.train()
        >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss
-        """
+        ```"""

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict


--- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
@@ -26,24 +26,25 @@ logger = logging.get_logger(__name__)

 class VisionEncoderDecoderConfig(PretrainedConfig):
    r"""
-    :class:`~transformers.VisionEncoderDecoderConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.VisionEncoderDecoderModel`. It is used to instantiate a Vision-Encoder-Text-Decoder model
+    [`VisionEncoderDecoderConfig`] is the configuration class to store the configuration of a
+    [`VisionEncoderDecoderModel`]. It is used to instantiate a Vision-Encoder-Text-Decoder model
    according to the specified arguments, defining the encoder and decoder configs.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
-        kwargs (`optional`):
+        kwargs (*optional*):
            Dictionary of keyword arguments. Notably:

-                - **encoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                  object that defines the encoder config.
-                - **decoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                  object that defines the decoder config.

-    Examples::
+    Examples:

+    ```python
    >>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel

    >>> # Initializing a ViT & BERT style configuration
@@ -68,7 +69,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
    >>> # loading model and config from pretrained folder
    >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('my-model')
    >>> model = VisionEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
-    """
+    ```"""
    model_type = "vision-encoder-decoder"
    is_composition = True

@@ -94,11 +95,11 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
-        Instantiate a :class:`~transformers.VisionEncoderDecoderConfig` (or a derived class) from a pre-trained encoder
+        Instantiate a [`VisionEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder
        model configuration and decoder model configuration.

        Returns:
-            :class:`VisionEncoderDecoderConfig`: An instance of a configuration object
+            [`VisionEncoderDecoderConfig`]: An instance of a configuration object
        """
        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
        decoder_config.is_decoder = True
@@ -108,10 +109,10 @@ class VisionEncoderDecoderConfig(PretrainedConfig):

    def to_dict(self):
        """
-        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+        Serializes this instance to a Python dictionary. Override the default *to_dict()* from *PretrainedConfig*.

        Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["encoder"] = self.encoder.to_dict()

--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -395,8 +395,9 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
        r"""
        Returns:

-        Example::
+        Example:

+        ```python
        >>> from transformers import FlaxVisionEncoderDecoderModel
        >>> from PIL import Image
        >>> import requests
@@ -411,8 +412,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):

        >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
        >>> encoder_outputs = model.encode(pixel_values)
-
-        """
+        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -471,8 +471,9 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
        r"""
        Returns:

-        Example::
+        Example:

+        ```python
        >>> from transformers import FlaxVisionEncoderDecoderModel
        >>> import jax.numpy as jnp
        >>> from PIL import Image
@@ -494,8 +495,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):

        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
        >>> logits = outputs.logits
-
-        """
+        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -599,8 +599,9 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
        r"""
        Returns:

-        Examples::
+        Examples:

+        ```python
        >>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer
        >>> from PIL import Image
        >>> import requests
@@ -626,7 +627,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
        >>> sequences = model.generate(pixel_values, num_beams=4, max_length=12).sequences

        >>> captions = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)
-        """
+        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (

--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -410,8 +410,9 @@ class VisionEncoderDecoderModel(PreTrainedModel):
        r"""
        Returns:

-        Examples::
+        Examples:

+        ```python
        >>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel
        >>> import requests
        >>> from PIL import Image
@@ -438,8 +439,7 @@ class VisionEncoderDecoderModel(PreTrainedModel):
        >>> # inference (generation)
        >>> generated_ids = model.generate(pixel_values)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        """
+        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}

--- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
@@ -27,28 +27,29 @@ logger = logging.get_logger(__name__)

 class VisionTextDualEncoderConfig(PretrainedConfig):
    r"""
-    :class:`~transformers.VisionTextDualEncoderConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.VisionTextDualEncoderModel`. It is used to instantiate
-    :class:`~transformers.VisionTextDualEncoderModel` model according to the specified arguments, defining the text
+    [`VisionTextDualEncoderConfig`] is the configuration class to store the configuration of a
+    [`VisionTextDualEncoderModel`]. It is used to instantiate a
+    [`VisionTextDualEncoderModel`] model according to the specified arguments, defining the text
    model and vision model configs.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
-        text_config_dict (:obj:`dict`):
+        text_config_dict (`dict`):
            Dictionary of configuration options that defines text model config.
-        vision_config_dict (:obj:`dict`):
+        vision_config_dict (`dict`):
            Dictionary of configuration options that defines vison model config.
-        projection_dim (:obj:`int`, `optional`, defaults to 512):
+        projection_dim (`int`, *optional*, defaults to 512):
            Dimentionality of text and vision projection layers.
-        logit_scale_init_value (:obj:`float`, `optional`, defaults to 2.6592):
-            The inital value of the `logit_scale` paramter. Default is used as per the original CLIP implementation.
-        kwargs (`optional`):
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+        kwargs (*optional*):
            Dictionary of keyword arguments.

-    Examples::
+    Examples:

+    ```python
    >>> from transformers import ViTConfig, BertConfig, VisionTextDualEncoderConfig, VisionTextDualEncoderModel

    >>> # Initializing a BERT and ViT configuration
@@ -70,7 +71,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
    >>> # loading model and config from pretrained folder
    >>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained('vit-bert')
    >>> model = VisionTextDualEncoderModel.from_pretrained('vit-bert', config=vision_text_config)
-    """
+    ```"""

    model_type = "vision-text-dual-encoder"
    is_composition = True
@@ -105,11 +106,11 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
    @classmethod
    def from_vision_text_configs(cls, vision_config: PretrainedConfig, text_config: PretrainedConfig, **kwargs):
        r"""
-        Instantiate a :class:`VisionTextDualEncoderConfig` (or a derived class) from text model configuration and
+        Instantiate a [`VisionTextDualEncoderConfig`] (or a derived class) from text model configuration and
        vision model configuration.

        Returns:
-            :class:`VisionTextDualEncoderConfig`: An instance of a configuration object
+            [`VisionTextDualEncoderConfig`]: An instance of a configuration object
        """

        return cls(vision_config=vision_config.to_dict(), text_config=text_config.to_dict(), **kwargs)
@@ -117,10 +118,10 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default
-        :meth:`~transformers.PretrainedConfig.to_dict`.
+        [`~PretrainedConfig.to_dict`].

        Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()

--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -528,8 +528,9 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
 VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
    Returns:

-    Examples::
+    Examples:

+    ```python
    >>> from PIL import Image
    >>> import requests
    >>> import jax
@@ -555,7 +556,7 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
    >>> outputs = model(**inputs)
    >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
    >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
-
+    ```
 """

 overwrite_call_docstring(