Unverified Commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
......@@ -23,44 +23,43 @@ logger = logging.get_logger(__name__)
class MT5Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
instantiate an mT5 model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
to that of the mT5 [google/mt5-small](https://huggingface.co/google/mt5-small) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Arguments:
vocab_size (`int`, *optional*, defaults to 250112):
Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
d_model (`int`, *optional*, defaults to 512):
Size of the encoder layers and the pooler layer.
d_kv (`int`, *optional*, defaults to 64):
Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
d_ff (`int`, *optional*, defaults to 1024):
Size of the intermediate feed forward layer in each `T5Block`.
num_layers (`int`, *optional*, defaults to 8):
Number of hidden layers in the Transformer encoder.
num_decoder_layers (`int`, *optional*):
Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
num_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer.
dropout_rate (`float`, *optional*, defaults to 0.1):
The ratio for all dropout layers.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
"""
model_type = "mt5"
......
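A minimal usage sketch for `MT5Config`, following the pattern of the other configuration examples in this commit; it assumes only that `MT5Config` and `MT5Model` are importable from `transformers`, as the docstring above already states:
```python
>>> from transformers import MT5Config, MT5Model
>>> # Initializing an mT5 google/mt5-small style configuration
>>> configuration = MT5Config()
>>> # Initializing a model (with random weights) from that configuration
>>> model = MT5Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```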
......@@ -26,91 +26,92 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://huggingface.c
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an [`OpenAIGPTModel`] or a [`TFOpenAIGPTModel`]. It is
used to instantiate a GPT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
[GPT](https://huggingface.co/openai-gpt) architecture from OpenAI.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`OpenAIGPTModel`] or [`TFOpenAIGPTModel`].
n_positions (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not special tokens should be predicted when the model has a language modeling head.
summary_type (`str`, *optional*, defaults to `"cls_index"`):
Argument used when doing sequence summary, used in the models
[`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
Has to be one of the following options:
- `"last"`: Take the last token hidden state (like XLNet).
- `"first"`: Take the first token hidden state (like BERT).
- `"mean"`: Take the mean of all tokens hidden states.
- `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- `"attn"`: Not implemented now, use multi-head attention.
summary_use_proj (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary, used in the models
[`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
Whether or not to add a projection after the vector extraction.
summary_activation (`str`, *optional*):
Argument used when doing sequence summary, used in the models
[`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary, used in the models
[`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
summary_first_dropout (`float`, *optional*, defaults to 0.1):
Argument used when doing sequence summary, used in the models
[`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
The dropout ratio to be used after the projection and activation.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
Examples:
```python
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "openai-gpt"
attribute_map = {
......
......@@ -75,18 +75,18 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
- lowercases all inputs,
- uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, falling back to BERT's
`BasicTokenizer` if not.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
......
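A minimal encode/decode sketch for the tokenizer documented above; it assumes the public `openai-gpt` checkpoint and uses only the generic `PreTrainedTokenizer` methods, so the exact ids are checkpoint-dependent:
```python
>>> from transformers import OpenAIGPTTokenizer
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
>>> tokens = tokenizer.tokenize("Hello World")  # note: inputs are lowercased before BPE
>>> ids = tokenizer.convert_tokens_to_ids(tokens)
>>> text = tokenizer.decode(ids)
```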
......@@ -39,21 +39,21 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
the following peculiarities:
- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
......
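The fast tokenizer is called the same way; a short sketch, again assuming the `openai-gpt` checkpoint (for plain text it should produce the same ids as the slow tokenizer above):
```python
>>> from transformers import OpenAIGPTTokenizerFast
>>> tokenizer = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")
>>> batch = tokenizer(["Hello world", "A second, slightly longer sentence."])
>>> len(batch["input_ids"])
2
```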
......@@ -28,77 +28,77 @@ PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class PegasusConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to instantiate a
PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the PEGASUS
[google/pegasus-large](https://huggingface.co/google/pegasus-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`PegasusModel`] or [`TFPegasusModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the classifier.
max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by dividing by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
forced_eos_token_id (`int`, *optional*, defaults to 1):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.
Example:
```python
>>> from transformers import PegasusModel, PegasusConfig
>>> # Initializing a PEGASUS google/pegasus-large style configuration
>>> configuration = PegasusConfig()
>>> # Initializing a model from the google/pegasus-large style configuration
>>> model = PegasusModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "pegasus"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
......
......@@ -989,17 +989,18 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
Example:
```python
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
>>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......@@ -1054,23 +1055,24 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:
Example:
```python
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
>>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs)
>>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
>>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> last_decoder_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......@@ -1322,23 +1324,24 @@ class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel):
r"""
Returns:
Example:
```python
>>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
>>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs)
>>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
>>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> logits = outputs.logits
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......
......@@ -1197,19 +1197,20 @@ class PegasusModel(PegasusPreTrainedModel):
r"""
Returns:
Example:
```python
>>> from transformers import PegasusTokenizer, PegasusModel
>>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
>>> model = PegasusModel.from_pretrained("google/pegasus-large")
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......
......@@ -40,56 +40,57 @@ logger = logging.get_logger(__name__)
class PegasusTokenizer(PreTrainedTokenizer):
r"""
Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
The token used for masking single token values. This is the token used when training this model with masked
language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
Summarization](https://arxiv.org/pdf/1912.08777.pdf).
mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
The token used for masking whole target sentences. This is the token used when training this model with gap
sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
<unk_2, ..., unk_102> are used as additional special tokens corresponding to the [original PEGASUS
tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
that uses the tokens 2 - 104 only for pretraining.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
vocab_files_names = VOCAB_FILES_NAMES
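A minimal usage sketch for the tokenizer documented above, using the `google/pegasus-large` checkpoint that the other Pegasus examples in this commit rely on (requires `sentencepiece`):
```python
>>> from transformers import PegasusTokenizer
>>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
>>> inputs = tokenizer("Studies have shown that owning a dog is good for you.", return_tensors="pt")
>>> decoded = tokenizer.decode(inputs.input_ids[0])  # ends with the </s> eos token described above
```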
......@@ -252,22 +253,22 @@ class PegasusTokenizer(PreTrainedTokenizer):
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence:
- single sequence: `X </s>`
- pair of sequences: `A B </s>` (not intended use)
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
......
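A small sketch of the behaviour described in `build_inputs_with_special_tokens` above; the concrete ids depend on the checkpoint, the only claim being that `eos_token_id` is appended:
```python
>>> from transformers import PegasusTokenizer
>>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
>>> ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Studies show that"))
>>> with_special = tokenizer.build_inputs_with_special_tokens(ids)
>>> with_special[-1] == tokenizer.eos_token_id
True
```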
......@@ -51,43 +51,44 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class PegasusTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
The token used for masking single token values. This is the token used when training this model with masked
language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
Summarization](https://arxiv.org/pdf/1912.08777.pdf).
mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
The token used for masking whole target sentences. This is the token used when training this model with gap
sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
<unk_2, ..., unk_102> are used as additional special tokens corresponding to the [original PEGASUS
tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
that uses the tokens 2 - 104 only for pretraining.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -175,17 +176,17 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
"""
Build model inputs from a sequence by adding eos to the end. No bos token is added to the front.
- single sequence: `X </s>`
- pair of sequences: `A B </s>` (not intended use)
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
......
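The fast tokenizer mirrors the slow one; a minimal sketch, again assuming the `google/pegasus-large` checkpoint:
```python
>>> from transformers import PegasusTokenizerFast
>>> tokenizer = PegasusTokenizerFast.from_pretrained("google/pegasus-large")
>>> ids = tokenizer("My friends are cool but they eat too many carbs.").input_ids
>>> ids[-1] == tokenizer.eos_token_id  # eos is appended, no bos is added
True
```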
......@@ -28,85 +28,86 @@ PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class PerceiverConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PerceiverModel`]. It is used
to instantiate a Perceiver model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Perceiver
[deepmind/language-perceiver](https://huggingface.co/deepmind/language-perceiver) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_latents (`int`, *optional*, defaults to 256):
The number of latents.
d_latents (`int`, *optional*, defaults to 1280):
Dimension of the latent embeddings.
d_model (`int`, *optional*, defaults to 768):
Dimension of the inputs. Should only be provided in case [`PerceiverTextPreprocessor`] is used or no
preprocessor is provided.
num_blocks (`int`, *optional*, defaults to 1):
Number of blocks in the Transformer encoder.
num_self_attends_per_block (`int`, *optional*, defaults to 26):
The number of self-attention layers per block.
num_self_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each self-attention layer in the Transformer encoder.
num_cross_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each cross-attention layer in the Transformer encoder.
qk_channels (`int`, *optional*):
Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
layers of the encoder. Will default to preserving the dimension of the queries if not specified.
v_channels (`int`, *optional*):
Dimension to project the values before applying attention in the cross-attention and self-attention layers
of the encoder. Will default to preserving the dimension of the queries if not specified.
cross_attention_shape_for_attention (`str`, *optional*, defaults to `'kv'`):
Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
self_attention_widening_factor (`int`, *optional*, defaults to 1):
Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder.
cross_attention_widening_factor (`int`, *optional*, defaults to 1):
Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
use_query_residual (`bool`, *optional*, defaults to `True`):
Whether to add a query residual in the cross-attention layer of the encoder.
vocab_size (`int`, *optional*, defaults to 262):
Vocabulary size for the masked language modeling model.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that the masked language modeling model might ever be used with. Typically set
this to something large just in case (e.g., 512 or 1024 or 2048).
image_size (`int`, *optional*, defaults to 56):
Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
train_size (`List[int]`, *optional*, defaults to [368, 496]):
Training size of the images for the optical flow model.
num_frames (`int`, *optional*, defaults to 16):
Number of video frames used for the multimodal autoencoding model.
audio_samples_per_frame (`int`, *optional*, defaults to 1920):
Number of audio samples per frame for the multimodal autoencoding model.
samples_per_patch (`int`, *optional*, defaults to 16):
Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
autoencoding model. This excludes the channel dimension.
Example:
```python
>>> from transformers import PerceiverModel, PerceiverConfig
>>> # Initializing a Perceiver deepmind/language-perceiver style configuration
>>> configuration = PerceiverConfig()
>>> # Initializing a model from the deepmind/language-perceiver style configuration
>>> model = PerceiverModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "perceiver"
def __init__(
......
......@@ -38,31 +38,31 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
r"""
Constructs a Perceiver feature extractor.
This feature extractor inherits from [`ImageFeatureExtractionMixin`] which contains most of the
main methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
the image is padded with 0's and then center cropped.
crop_size (`int`, *optional*, defaults to 256):
Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
`True`.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
`PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
Only has an effect if `do_resize` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with `image_mean` and `image_std`.
image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
......@@ -92,11 +92,11 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
def center_crop(self, image):
"""
Crops `image` to *self.crop_size* using a center crop. Note that if the image is too small to be cropped
to the size given, it will be padded (so the returned result has the size asked).
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to center crop.
"""
......@@ -125,27 +125,29 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
......
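A minimal sketch for the feature extractor documented above, reusing the same COCO test image as the `PerceiverModel` example below (assumes `Pillow`, `requests` and `torch` are installed):
```python
>>> from transformers import PerceiverFeatureExtractor
>>> from PIL import Image
>>> import requests
>>> feature_extractor = PerceiverFeatureExtractor()
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> list(encoding["pixel_values"].shape)  # [batch_size, num_channels, height, width]
```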
......@@ -765,83 +765,84 @@ class PerceiverModel(PerceiverPreTrainedModel):
r"""
Returns:
Examples:
```python
>>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
>>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
>>> import torch
>>> import requests
>>> from PIL import Image
>>> # EXAMPLE 1: using the Perceiver to classify texts
>>> # - we define a TextPreprocessor, which can be used to embed tokens
>>> # - we define a ClassificationDecoder, which can be used to decode the
>>> # final hidden states of the latents to classification logits
>>> # using trainable position embeddings
>>> config = PerceiverConfig()
>>> preprocessor = PerceiverTextPreprocessor(config)
>>> decoder = PerceiverClassificationDecoder(config,
... num_channels=config.d_latents,
... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
... use_query_residual=True)
>>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
>>> # you can then do a forward pass as follows:
>>> tokenizer = PerceiverTokenizer()
>>> text = "hello world"
>>> inputs = tokenizer(text, return_tensors="pt").input_ids
>>> with torch.no_grad():
...     outputs = model(inputs=inputs)
>>> logits = outputs.logits
>>> # to train, one can train the model using standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss()
>>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels)
>>> # EXAMPLE 2: using the Perceiver to classify images
>>> # - we define an ImagePreprocessor, which can be used to embed images
>>> preprocessor = PerceiverImagePreprocessor(
... config,
... prep_type="conv1x1",
... spatial_downsample=1,
... out_channels=256,
... position_encoding_type="trainable",
... concat_or_add_pos="concat",
... project_pos_dim=256,
... trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2,
... ),
... )
>>> model = PerceiverModel(
... config,
... input_preprocessor=preprocessor,
... decoder=PerceiverClassificationDecoder(
... config,
... num_channels=config.d_latents,
... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
... use_query_residual=True,
... ),
... )
>>> # you can then do a forward pass as follows:
>>> feature_extractor = PerceiverFeatureExtractor()
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
>>> with torch.no_grad():
...     outputs = model(inputs=inputs)
>>> logits = outputs.logits
>>> # to train, one can train the model using standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss()
>>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......
......@@ -28,26 +28,29 @@ class PerceiverTokenizer(PreTrainedTokenizer):
"""
Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[BOS]"`):
bos_token (`str`, *optional*, defaults to `"[BOS]"`):
The BOS token (reserved in the vocab, but not actually used).
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[EOS]"`):
eos_token (`str`, *optional*, defaults to `"[EOS]"`):
The end of sequence token (reserved in the vocab, but not actually used).
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The MASK token, useful for masked language modeling.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The CLS token (reserved in the vocab, but not actually used).
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from two sequences.
"""
......@@ -115,18 +118,18 @@ class PerceiverTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -145,17 +148,17 @@ class PerceiverTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks. A sequence has the
following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
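To make the `[CLS] X [SEP]` pattern concrete, here is a small check against the single-sequence branch shown above; the integer ids are placeholders:

```python
>>> from transformers import PerceiverTokenizer

>>> tokenizer = PerceiverTokenizer()
>>> ids = tokenizer.build_inputs_with_special_tokens([10, 11, 12])
>>> ids == [tokenizer.cls_token_id, 10, 11, 12, tokenizer.sep_token_id]
True
```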
......
......@@ -69,41 +69,47 @@ class PhobertTokenizer(PreTrainedTokenizer):
"""
Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
merges_file (`str`):
Path to the merges file.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
......@@ -162,17 +168,17 @@ class PhobertTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A PhoBERT sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
......@@ -186,18 +192,18 @@ class PhobertTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -217,13 +223,13 @@ class PhobertTokenizer(PreTrainedTokenizer):
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
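As the docstring above states, PhoBERT has no segment ids, so the returned mask is all zeros regardless of the inputs. A quick illustration with placeholder ids, again assuming the `vinai/phobert-base` checkpoint:

```python
>>> from transformers import PhobertTokenizer

>>> tokenizer = PhobertTokenizer.from_pretrained("vinai/phobert-base")
>>> set(tokenizer.create_token_type_ids_from_sequences([10, 11], [12]))
{0}
```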
......
......@@ -28,69 +28,69 @@ PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ProphetNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ProphetNetModel`. It is used
This is the configuration class to store the configuration of a [`ProphetNetModel`]. It is used
to instantiate a ProphetNet model according to the specified arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
activation_dropout (:obj:`float`, `optional`, defaults to 0.1):
activation_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for activations inside the fully connected layer.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
vocab_size (:obj:`int`, `optional`, defaults to 30522):
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
hidden_size (:obj:`int`, `optional`, defaults to 1024):
the `inputs_ids` passed when calling [`ProphetNetModel`].
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
num_encoder_layers (:obj:`int`, `optional`, defaults to 12):
num_encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
num_encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
num_encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
Dimensionality of the ``intermediate`` (often named feed-forward) layer in decoder.
num_decoder_layers (:obj:`int`, `optional`, defaults to 12):
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the `intermediate` (often named feed-forward) layer in the decoder.
num_decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
num_decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
num_decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
dropout (:obj:`float`, `optional`, defaults to 0.1):
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (:obj:`float`, `optional`, defaults to 0.02):
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
add_cross_attention (`bool`, *optional*, defaults to `True`):
Whether cross-attention layers should be added to the model.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether this is an encoder/decoder model.
pad_token_id (:obj:`int`, `optional`, defaults to 1)
pad_token_id (`int`, *optional*, defaults to 1)
Padding token id.
bos_token_id (:obj:`int`, `optional`, defaults to 0)
bos_token_id (`int`, *optional*, defaults to 0)
Beginning of stream token id.
eos_token_id (:obj:`int`, `optional`, defaults to 2)
eos_token_id (`int`, *optional*, defaults to 2)
End of stream token id.
ngram (:obj:`int`, `optional`, defaults to 2)
ngram (`int`, *optional*, defaults to 2)
Number of future tokens to predict. Set to 1 to be the same as a traditional language model, predicting only the
next token.
num_buckets (:obj:`int`, `optional`, defaults to 32)
num_buckets (`int`, *optional*, defaults to 32)
The number of buckets to use for each attention layer. This is for relative position calculation. See the
`T5 paper <see https://arxiv.org/abs/1910.10683>`__ for more details.
relative_max_distance (:obj:`int`, `optional`, defaults to 128)
[T5 paper](https://arxiv.org/abs/1910.10683) for more details.
relative_max_distance (`int`, *optional*, defaults to 128)
Relative distances greater than this number will be put into the last same bucket. This is for relative
position calculation. See the `T5 paper <see https://arxiv.org/abs/1910.10683>`__ for more details.
disable_ngram_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
position calculation. See the [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
disable_ngram_loss (`bool`, *optional*, defaults to `False`):
Whether to train the model predicting only the next first token.
eps (:obj:`float`, `optional`, defaults to 0.0):
Controls the ``epsilon`` parameter value for label smoothing in the loss calculation. If set to 0, no label
eps (`float`, *optional*, defaults to 0.0):
Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
smoothing is performed.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
"""
model_type = "prophetnet"
......
......@@ -1271,18 +1271,19 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
r"""
Returns:
Example::
Example:
>>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
>>> import torch
```python
>>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
>>> import torch
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
>>> last_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -1788,20 +1789,21 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
r"""
Returns:
Example::
Example:
>>> from transformers import ProphetNetTokenizer, ProphetNetModel
```python
>>> from transformers import ProphetNetTokenizer, ProphetNetModel
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state # main stream hidden states
>>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states
"""
>>> last_hidden_states = outputs.last_hidden_state # main stream hidden states
>>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states
```"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......
......@@ -56,46 +56,45 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
r"""
Construct a ProphetNetTokenizer. Based on WordPiece.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (:obj:`Iterable`, `optional`):
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
`do_basic_tokenize=True`
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
x_sep_token (:obj:`str`, `optional`, defaults to :obj:`"[X_SEP]"`):
x_sep_token (`str`, *optional*, defaults to `"[X_SEP]"`):
Special second separator token, which can be generated by
:class:`~transformers.ProphetNetForConditionalGeneration`. It is used to separate bullet-point like
[`ProphetNetForConditionalGeneration`]. It is used to separate bullet-point like
sentences in summarization.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents: (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
value for `lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
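A short usage sketch, reusing the `microsoft/prophetnet-large-uncased` checkpoint that the model examples above already rely on:

```python
>>> from transformers import ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
```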
......@@ -189,18 +188,18 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -218,21 +217,21 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
sequence pair mask has the following format:
::
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
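To illustrate the 0/1 layout from the diagram above with placeholder ids (checkpoint as before; the exact mask length depends on how many separators the tokenizer inserts):

```python
>>> from transformers import ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> mask = tokenizer.create_token_type_ids_from_sequences([10, 11], [12, 13])
>>> sorted(set(mask))  # 0s cover the first sequence, 1s the second
[0, 1]
```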
......@@ -267,17 +266,17 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
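Judging from the single-sequence branch visible above, the token ids are returned unchanged apart from a trailing separator; a quick check with placeholder ids (checkpoint as before):

```python
>>> from transformers import ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> ids = tokenizer.build_inputs_with_special_tokens([10, 11, 12])
>>> ids[-1] == tokenizer.sep_token_id
True
```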
......
......@@ -28,60 +28,60 @@ QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class QDQBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.QDQBertModel`. It is used to
This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to
instantiate a QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased
<https://huggingface.co/bert-base-uncased>`__ architecture.
configuration with the defaults will yield a similar configuration to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 30522):
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.QDQBertModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768):
`inputs_ids` passed when calling [`QDQBertModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.QDQBertModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`QDQBertModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if ``config.is_decoder=True``.
relevant if `config.is_decoder=True`.
Examples::
Examples:
>>> from transformers import QDQBertModel, QDQBertConfig
```python
>>> from transformers import QDQBertModel, QDQBertConfig
>>> # Initializing a QDQBERT bert-base-uncased style configuration
>>> configuration = QDQBertConfig()
>>> # Initializing a QDQBERT bert-base-uncased style configuration
>>> configuration = QDQBertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = QDQBertModel(configuration)
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = QDQBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "qdqbert"
def __init__(
......
......@@ -21,62 +21,62 @@ from ...file_utils import add_start_docstrings
RAG_CONFIG_DOC = r"""
:class:`~transformers.RagConfig` stores the configuration of a `RagModel`. Configuration objects inherit from
:class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
:class:`~transformers.PretrainedConfig` for more information.
[`RagConfig`] stores the configuration of a *RagModel*. Configuration objects inherit from
[`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
[`PretrainedConfig`] for more information.
Args:
title_sep (:obj:`str`, `optional`, defaults to ``" / "``):
title_sep (`str`, *optional*, defaults to `" / "`):
Separator inserted between the title and the text of the retrieved document when calling
:class:`~transformers.RagRetriever`.
doc_sep (:obj:`str`, `optional`, defaults to ``" // "``):
[`RagRetriever`].
doc_sep (`str`, *optional*, defaults to `" // "`):
Separator inserted between the text of the retrieved document and the original input when calling
:class:`~transformers.RagRetriever`.
n_docs (:obj:`int`, `optional`, defaults to 5):
[`RagRetriever`].
n_docs (`int`, *optional*, defaults to 5):
Number of documents to retrieve.
max_combined_length (:obj:`int`, `optional`, defaults to 300):
Max length of contextualized input returned by :meth:`~transformers.RagRetriever.__call__`.
retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
max_combined_length (`int`, *optional*, defaults to 300):
Max length of contextualized input returned by [`~RagRetriever.__call__`].
retrieval_vector_size (`int`, *optional*, defaults to 768):
Dimensionality of the document embeddings indexed by [`RagRetriever`].
retrieval_batch_size (`int`, *optional*, defaults to 8):
Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated by
:class:`~transformers.RagRetriever`.
dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
[`RagRetriever`].
dataset (`str`, *optional*, defaults to `"wiki_dpr"`):
A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
using :obj:`datasets.list_datasets()`).
dataset_split (:obj:`str`, `optional`, defaults to :obj:`"train"`)
Which split of the :obj:`dataset` to load.
index_name (:obj:`str`, `optional`, defaults to :obj:`"compressed"`)
The index name of the index associated with the :obj:`dataset`. One can choose between :obj:`"legacy"`,
:obj:`"exact"` and :obj:`"compressed"`.
index_path (:obj:`str`, `optional`)
using `datasets.list_datasets()`).
dataset_split (`str`, *optional*, defaults to `"train"`)
Which split of the `dataset` to load.
index_name (`str`, *optional*, defaults to `"compressed"`)
The index name of the index associated with the `dataset`. One can choose between `"legacy"`,
`"exact"` and `"compressed"`.
index_path (`str`, *optional*)
The path to the serialized faiss index on disk.
passages_path: (:obj:`str`, `optional`):
passages_path: (`str`, *optional*):
A path to text passages compatible with the faiss index. Required if using
:class:`~transformers.models.rag.retrieval_rag.LegacyIndex`
use_dummy_dataset (:obj:`bool`, `optional`, defaults to ``False``)
Whether to load a "dummy" variant of the dataset specified by :obj:`dataset`.
label_smoothing (:obj:`float`, `optional`, defaults to 0.0):
Only relevant if ``return_loss`` is set to :obj:`True`. Controls the ``epsilon`` parameter value for label
[`~models.rag.retrieval_rag.LegacyIndex`]
use_dummy_dataset (`bool`, *optional*, defaults to `False`)
Whether to load a "dummy" variant of the dataset specified by `dataset`.
label_smoothing (`float`, *optional*, defaults to 0.0):
Only relevant if `return_loss` is set to `True`. Controls the `epsilon` parameter value for label
smoothing in the loss calculation. If set to 0, no label smoothing is performed.
do_marginalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`True`, the logits are marginalized over all documents by making use of
``torch.nn.functional.log_softmax``.
reduce_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to reduce the NLL loss using the ``torch.Tensor.sum`` operation.
do_deduplication (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_marginalize (`bool`, *optional*, defaults to `False`):
If `True`, the logits are marginalized over all documents by making use of
`torch.nn.functional.log_softmax`.
reduce_loss (`bool`, *optional*, defaults to `False`):
Whether or not to reduce the NLL loss using the `torch.Tensor.sum` operation.
do_deduplication (`bool`, *optional*, defaults to `True`):
Whether or not to deduplicate the generations from different context documents for a given input. Has to be
set to :obj:`False` if used while training with a distributed backend.
exclude_bos_score (:obj:`bool`, `optional`, defaults to :obj:`False`):
set to `False` if used while training with a distributed backend.
exclude_bos_score (`bool`, *optional*, defaults to `False`):
Whether or not to disregard the BOS token when computing the loss.
output_retrieved(:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to ``True``, :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`, :obj:`context_input_ids` and
:obj:`context_attention_mask` are returned. See returned tensors for more detail.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
output_retrieved(`bool`, *optional*, defaults to `False`):
If set to `True`, `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
`context_attention_mask` are returned. See returned tensors for more detail.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
forced_eos_token_id (:obj:`int`, `optional`):
The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
:obj:`eos_token_id`.
forced_eos_token_id (`int`, *optional*):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.
"""
......@@ -174,21 +174,21 @@ class RagConfig(PretrainedConfig):
cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
r"""
Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
configuration and decoder model configuration.
Returns:
:class:`EncoderDecoderConfig`: An instance of a configuration object
[`EncoderDecoderConfig`]: An instance of a configuration object
"""
return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)
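A sketch of how this classmethod might be used; pairing a DPR question encoder with a BART generator is only a typical choice and is an assumption here, not something mandated by the method:

```python
>>> from transformers import BartConfig, DPRConfig, RagConfig

>>> # hypothetical pairing: a DPR question encoder with a BART generator
>>> config = RagConfig.from_question_encoder_generator_configs(DPRConfig(), BartConfig())
```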
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default
:meth:`~transformers.PretrainedConfig.to_dict`.
[`~PretrainedConfig.to_dict`].
Returns:
:obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["question_encoder"] = self.question_encoder.to_dict()
......
......@@ -544,19 +544,20 @@ class RagModel(RagPreTrainedModel):
r"""
Returns:
Example::
Example:
>>> from transformers import RagTokenizer, RagRetriever, RagModel
>>> import torch
```python
>>> from transformers import RagTokenizer, RagRetriever, RagModel
>>> import torch
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"])
"""
>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"])
```"""
n_docs = n_docs if n_docs is not None else self.config.n_docs
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......