Unverified Commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
@@ -30,72 +30,75 @@ BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BigBirdPegasusConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BigBirdPegasusModel`]. It is
used to instantiate a BigBirdPegasus model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the BigBirdPegasus
[google/bigbird-pegasus-large-arxiv](https://huggingface.co/google/bigbird-pegasus-large-arxiv) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.

Args:
vocab_size (`int`, *optional*, defaults to 96103):
Vocabulary size of the BigBirdPegasus model. Defines the number of different tokens that can be represented
by the `inputs_ids` passed when calling [`BigBirdPegasusModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimension of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 16):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 16):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the classifier.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 1024 or 2048 or 4096).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
attention_type (`str`, *optional*, defaults to `"block_sparse"`):
Whether to use block sparse attention (with O(n) complexity, as introduced in the paper) or the original
attention layer (with O(n^2) complexity) in the encoder. Possible values are `"original_full"` and
`"block_sparse"`.
use_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in query, key, value.
block_size (`int`, *optional*, defaults to 64):
Size of each block. Useful only when `attention_type == "block_sparse"`.
num_random_blocks (`int`, *optional*, defaults to 3):
Each query is going to attend to this many random blocks. Useful only when `attention_type ==
"block_sparse"`.
scale_embeddings (`bool`, *optional*, defaults to `True`):
Whether to rescale embeddings with (hidden_size ** 0.5).

Example:

```python
>>> from transformers import BigBirdPegasusModel, BigBirdPegasusConfig
...
```
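The rest of the example above is collapsed in the diff. As a hedged sketch of how the block-sparse options documented in this docstring combine (all values shown are the documented defaults, so this is equivalent to `BigBirdPegasusConfig()`):

```python
from transformers import BigBirdPegasusConfig, BigBirdPegasusModel

# Spell out the sparse-attention knobs described above.
config = BigBirdPegasusConfig(
    attention_type="block_sparse",  # vs. "original_full" (quadratic attention)
    block_size=64,                  # only used by block-sparse attention
    num_random_blocks=3,            # random blocks each query attends to
)
model = BigBirdPegasusModel(config)
print(model.config.attention_type)  # block_sparse
```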
@@ -28,77 +28,78 @@ BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BlenderbotConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BlenderbotModel`]. It is used
to instantiate a Blenderbot model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot
[facebook/blenderbot-3B](https://huggingface.co/facebook/blenderbot-3B) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.

Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`BlenderbotModel`] or
[`TFBlenderbotModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the classifier.
max_position_embeddings (`int`, *optional*, defaults to 128):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by dividing by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.

Example:

```python
>>> from transformers import BlenderbotModel, BlenderbotConfig

>>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
>>> configuration = BlenderbotConfig()

>>> # Initializing a model from the facebook/blenderbot-3B style configuration
>>> model = BlenderbotModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "blenderbot"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
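The `attribute_map` line above is what lets model-agnostic code read generic names on this config. A small sketch of the aliasing, assuming only what the mapping itself states:

```python
from transformers import BlenderbotConfig

config = BlenderbotConfig()
# The generic names resolve through attribute_map to Blenderbot's own fields,
# so code written against hidden_size / num_attention_heads keeps working.
assert config.hidden_size == config.d_model
assert config.num_attention_heads == config.encoder_attention_heads
```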
@@ -1128,19 +1128,20 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotTokenizer, BlenderbotModel

>>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

>>> last_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -977,17 +977,18 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration

>>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
>>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')

>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
>>> encoder_outputs = model.encode(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1044,23 +1045,24 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration

>>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
>>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')

>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
>>> encoder_outputs = model.encode(**inputs)

>>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

>>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> last_decoder_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1312,23 +1314,24 @@ class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration

>>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
>>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')

>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
>>> encoder_outputs = model.encode(**inputs)

>>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

>>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> logits = outputs.logits
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -47,11 +47,11 @@ class BlenderbotTokenizer(RobertaTokenizer):
r"""
Construct a Blenderbot tokenizer.

[`Blenderbot`] is nearly identical to [`RobertaTokenizer`] and runs
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add the BOS token
to the beginning of sequences.

Refer to superclass [`RobertaTokenizer`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -63,16 +63,16 @@ class BlenderbotTokenizer(RobertaTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A Blenderbot sequence has the following format:

- single sequence: ` X </s>`

Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (`List[int]`, *optional*):
Will be ignored

Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
return token_ids_0 + [self.eos_token_id]
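A short usage sketch of `build_inputs_with_special_tokens` as defined above; the checkpoint name is the one this file's own examples already use:

```python
from transformers import BlenderbotTokenizer

tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
ids = tokenizer.encode("Sam is great", add_special_tokens=False)
# Per the method body above, only </s> is appended; no BOS is prepended.
assert tokenizer.build_inputs_with_special_tokens(ids) == ids + [tokenizer.eos_token_id]
```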
@@ -46,13 +46,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
class BlenderbotTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library).

[`BlenderbotFast`] is nearly identical to [`RobertaTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add the BOS token
to the beginning of sequences.

Refer to superclass [`RobertaTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -65,16 +65,16 @@ class BlenderbotTokenizerFast(RobertaTokenizerFast):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A Blenderbot sequence has the following format:

- single sequence: ` X </s>`

Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (`List[int]`, *optional*):
Will be ignored

Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
return token_ids_0 + [self.eos_token_id]
@@ -28,77 +28,78 @@ BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BlenderbotSmallConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BlenderbotSmallModel`]. It is
used to instantiate a BlenderbotSmall model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall
[facebook/blenderbot_small-90M](https://huggingface.co/facebook/blenderbot_small-90M) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.

Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be
represented by the `inputs_ids` passed when calling [`BlenderbotSmallModel`] or
[`TFBlenderbotSmallModel`].
d_model (`int`, *optional*, defaults to 512):
Dimensionality of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 8):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 8):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the classifier.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by dividing by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.

Example:

```python
>>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig

>>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
>>> configuration = BlenderbotSmallConfig()

>>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
>>> model = BlenderbotSmallModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "blenderbot-small"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
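To see `forced_eos_token_id` line up with `eos_token_id` the way the docstring suggests, a sketch (this assumes the documented defaults, including `eos_token_id` defaulting to 2, which is not spelled out above):

```python
from transformers import BlenderbotSmallConfig

config = BlenderbotSmallConfig()
# Both default to 2, so generation that hits max_length is forced to end on </s>.
print(config.forced_eos_token_id, config.eos_token_id)
```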
@@ -1115,19 +1115,20 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel

>>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
>>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")

>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

>>> last_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -989,17 +989,18 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration

>>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
>>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')

>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1056,23 +1057,24 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration

>>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
>>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')

>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs)

>>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

>>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> last_decoder_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1324,23 +1326,24 @@ class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedModel):
r"""
Returns:

Example:

```python
>>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration

>>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
>>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')

>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
>>> encoder_outputs = model.encode(**inputs)

>>> decoder_start_token_id = model.config.decoder_start_token_id
>>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

>>> outputs = model.decode(decoder_input_ids, encoder_outputs)
>>> logits = outputs.logits
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -68,25 +68,25 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
"""
Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair Encoding).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to the superclass for more information regarding methods.

Args:
vocab_file (`str`):
File containing the vocabulary.
merges_file (`str`):
Path to the merges file.
bos_token (`str`, *optional*, defaults to `"__start__"`):
The beginning of sentence token.
eos_token (`str`, *optional*, defaults to `"__end__"`):
The end of sentence token.
unk_token (`str`, *optional*, defaults to `"__unk__"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"__pad__"`):
The token used for padding, for example when batching sequences of different lengths.
**kwargs:
Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
vocab_files_names = VOCAB_FILES_NAMES
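A quick sketch of this BPE tokenizer in use (the actual tokens and ids depend on the checkpoint's vocabulary and merges file, so no specific output is claimed):

```python
from transformers import BlenderbotSmallTokenizer

tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
# BPE splits out-of-vocabulary words into subword units from the merges file.
tokens = tokenizer.tokenize("My friends are cool but they eat too many carbs.")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens[:5], ids[:5])
```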
@@ -49,10 +49,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library).

Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
@@ -101,13 +101,13 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
does not make use of token type ids, therefore a list of zeros is returned.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
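A sketch of the all-zeros contract described above:

```python
from transformers import BlenderbotSmallTokenizerFast

tokenizer = BlenderbotSmallTokenizerFast.from_pretrained("facebook/blenderbot_small-90M")
ids = tokenizer.encode("hello there", add_special_tokens=False)
# The model has no segment embeddings, so every position gets type id 0.
type_ids = tokenizer.create_token_type_ids_from_sequences(ids)
assert set(type_ids) == {0}
```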
@@ -29,29 +29,31 @@ class ByT5Tokenizer(PreTrainedTokenizer):
"""
Construct a ByT5 tokenizer. ByT5 simply uses raw UTF-8 byte encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.

Args:
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.

</Tip>

unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (`int`, *optional*, defaults to 100):
Adds a number of extra ids to the end of the vocabulary for use as sentinels. These tokens are
accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the
vocabulary, like in ByT5 preprocessing; see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
"""
@@ -116,18 +118,18 @@ class ByT5Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.

Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
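Continuing the ByT5 sketch above, the mask flags exactly the appended `</s>`:

```python
from transformers import ByT5Tokenizer

tokenizer = ByT5Tokenizer()
ids = tokenizer("hi").input_ids  # </s> already appended by __call__
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
print(mask)  # [0, 0, 1] -- only the trailing </s> is special
```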
@@ -157,13 +159,13 @@ class ByT5Tokenizer(PreTrainedTokenizer):
make use of token type ids, therefore a list of zeros is returned.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of zeros.
"""
eos = [self.eos_token_id]
@@ -178,17 +180,17 @@ class ByT5Tokenizer(PreTrainedTokenizer):
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
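For readers skimming the converted docstrings, here is a minimal sketch of how these ByT5 helpers behave together (assuming the `google/byt5-small` checkpoint; the exact byte-to-id offset is an implementation detail):

```python
from transformers import ByT5Tokenizer

tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")

# Single sequence: the tokenizer appends an EOS id, giving the `X </s>` format
ids = tokenizer("hi").input_ids
assert ids[-1] == tokenizer.eos_token_id

# Only the trailing EOS is flagged as a special token
print(tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True))
# [0, 0, 1]
```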
@@ -34,7 +34,7 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig):
    """
    This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate documentation alongside
    usage examples.
    """
@@ -44,65 +44,70 @@ SPIECE_UNDERLINE = "▁"
class CamembertTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other
            things, to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
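As a quick illustration of the `sp_model_kwargs` knob described above, here is a hedged sketch of enabling subword regularization (the sampling values are illustrative, not recommended defaults):

```python
from transformers import CamembertTokenizer

# Sampling settings are illustrative; omit sp_model_kwargs for deterministic tokenization
tokenizer = CamembertTokenizer.from_pretrained(
    "camembert-base",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)
print(tokenizer.tokenize("J'aime le camembert !"))  # segmentation may vary run to run
```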
@@ -158,17 +163,17 @@ class CamembertTokenizer(PreTrainedTokenizer):
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CamemBERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
@@ -182,18 +187,18 @@ class CamembertTokenizer(PreTrainedTokenizer):
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
@@ -212,13 +217,13 @@ class CamembertTokenizer(PreTrainedTokenizer):
        RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
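A small sketch tying these three helpers together (assuming the `camembert-base` checkpoint):

```python
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

ids_a = tokenizer.encode("Phrase A", add_special_tokens=False)
ids_b = tokenizer.encode("Phrase B", add_special_tokens=False)

# Pair format: `<s> A </s></s> B </s>`
pair_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)

# CamemBERT, like RoBERTa, does not use token type ids: the mask is all zeros
print(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b))
```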
@@ -53,47 +53,52 @@ SPIECE_UNDERLINE = "▁"
class CamembertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
    """
@@ -144,17 +149,17 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CamemBERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
@@ -171,13 +176,13 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
        RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
@@ -28,66 +28,66 @@ CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CanineConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CanineModel`]. It is used to instantiate a
    CANINE model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the CANINE
    [google/canine-s](https://huggingface.co/google/canine-s) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the deep Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoders.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoders.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoders, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
            The maximum sequence length that this model might ever be used with.
        type_vocab_size (`int`, *optional*, defaults to 16):
            The vocabulary size of the `token_type_ids` passed when calling [`CanineModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        downsampling_rate (`int`, *optional*, defaults to 4):
            The rate at which to downsample the original character sequence length before applying the deep
            Transformer encoder.
        upsampling_kernel_size (`int`, *optional*, defaults to 4):
            The kernel size (i.e. the number of characters in each window) of the convolutional projection layer
            when projecting back from `hidden_size`*2 to `hidden_size`.
        num_hash_functions (`int`, *optional*, defaults to 8):
            The number of hash functions to use. Each hash function has its own embedding matrix.
        num_hash_buckets (`int`, *optional*, defaults to 16384):
            The number of hash buckets to use.
        local_transformer_stride (`int`, *optional*, defaults to 128):
            The stride of the local attention of the first shallow Transformer encoder. Defaults to 128 for good
            TPU/XLA memory alignment.

    Example:

    ```python
    >>> from transformers import CanineModel, CanineConfig

    >>> # Initializing a CANINE google/canine-s style configuration
    >>> configuration = CanineConfig()

    >>> # Initializing a model from the google/canine-s style configuration
    >>> model = CanineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "canine"

    def __init__(
@@ -65,13 +65,13 @@ class CanineTokenizer(PreTrainedTokenizer):
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
            The maximum sentence length the model accepts.
    """
@@ -160,17 +160,17 @@ class CanineTokenizer(PreTrainedTokenizer):
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
@@ -185,18 +185,18 @@ class CanineTokenizer(PreTrainedTokenizer):
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
@@ -215,21 +215,21 @@ class CanineTokenizer(PreTrainedTokenizer):
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
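A short sketch of the resulting mask (again assuming `google/canine-s`):

```python
from transformers import CanineTokenizer

tokenizer = CanineTokenizer.from_pretrained("google/canine-s")

ids_a = tokenizer.encode("ab", add_special_tokens=False)
ids_b = tokenizer.encode("cd", add_special_tokens=False)

# 0s cover `[CLS] A [SEP]`, 1s cover `B [SEP]`
print(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b))
# [0, 0, 0, 0, 1, 1, 1]
```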
@@ -30,58 +30,58 @@ CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CLIPTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate a CLIP
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`CLIPModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPTextModel, CLIPTextConfig

    >>> # Initializing a CLIPTextModel with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPTextConfig()

    >>> # Initializing a CLIPTextConfig from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clip_text_model"

    def __init__(
@@ -121,56 +121,56 @@ class CLIPTextConfig(PretrainedConfig):
class CLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate a CLIP
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPVisionModel, CLIPVisionConfig

    >>> # Initializing a CLIPVisionModel with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel model from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clip_vision_model"
@@ -208,23 +208,23 @@ class CLIPVisionConfig(PretrainedConfig):
class CLIPConfig(PretrainedConfig):
    r"""
    [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to
    instantiate a CLIP model according to the specified arguments, defining the text model and vision model configs.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config_dict (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
        vision_config_dict (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP
            implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.
    """
@@ -259,11 +259,11 @@ class CLIPConfig(PretrainedConfig):
    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Instantiate a [`CLIPConfig`] (or a derived class) from a CLIP text model configuration and a CLIP vision
        model configuration.

        Returns:
            [`CLIPConfig`]: An instance of a configuration object
        """
        return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs)
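A hedged sketch of this classmethod in use (the overridden values are illustrative):

```python
from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig

text_config = CLIPTextConfig(hidden_size=512)    # illustrative override
vision_config = CLIPVisionConfig(patch_size=32)  # illustrative override

config = CLIPConfig.from_text_vision_configs(text_config, vision_config)
print(config.projection_dim)  # 512 (the default)
```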
@@ -271,10 +271,10 @@ class CLIPConfig(PretrainedConfig):
    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["text_config"] = self.text_config.to_dict()
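As the override suggests, the nested sub-configs come back as plain dictionaries; a minimal sketch:

```python
from transformers import CLIPConfig

config = CLIPConfig()
config_dict = config.to_dict()

# The nested text and vision configs are serialized as plain dicts
print(type(config_dict["text_config"]))    # <class 'dict'>
print(type(config_dict["vision_config"]))  # <class 'dict'>
```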
@@ -32,29 +32,29 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    r"""
    Constructs a CLIP feature extractor.

    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
    should refer to this superclass for more information regarding those methods.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to a certain `size`.
        size (`int`, *optional*, defaults to 224):
            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`.
        resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an
            effect if `do_resize` is set to `True`.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
            the image is padded with 0's and then center cropped.
        crop_size (`int`, *optional*, defaults to 224):
            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
            `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with `image_mean` and `image_std`.
        image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
            The sequence of means for each channel, to be used when normalizing images.
        image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
            The sequence of standard deviations for each channel, to be used when normalizing images.
    """
@@ -93,27 +93,29 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
        """
        Main method to prepare for the model one or several image(s).

        <Tip warning={true}>

        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
        PIL images.

        </Tip>

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is
                the number of channels and H and W are image height and width.
            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **pixel_values** -- Pixel values to be fed to a model.
        """
@@ -157,13 +159,13 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    def center_crop(self, image, size):
        """
        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
        the size given, it will be padded (so the returned result has the size asked).

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to crop.
            size (`int` or `Tuple[int, int]`):
                The size to which to crop the image.
        """
        self._ensure_format_supported(image)
@@ -183,14 +185,14 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
    def resize(self, image, size, resample=Image.BICUBIC):
        """
        Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to resize.
            size (`int` or `Tuple[int, int]`):
                The size to use for resizing the image. If `int`, it will be resized to match the shorter side.
            resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
                The filter to use for resampling.
        """
        self._ensure_format_supported(image)
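These two helpers can also be called directly; a hedged sketch:

```python
from PIL import Image
import requests
from transformers import CLIPFeatureExtractor

feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

resized = feature_extractor.resize(image, size=224)        # shorter side -> 224
cropped = feature_extractor.center_crop(resized, size=224) # square center crop
print(cropped.size)  # (224, 224)
```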
@@ -704,19 +704,20 @@ class CLIPTextModel(CLIPPreTrainedModel):
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
@@ -810,24 +811,25 @@ class CLIPVisionModel(CLIPPreTrainedModel):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import CLIPProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
@@ -968,25 +970,25 @@ class CLIPModel(CLIPPreTrainedModel):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import CLIPProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
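Beyond the joint forward pass shown above, `CLIPModel` also exposes the individual embedding heads; a hedged sketch using `get_text_features`:

```python
import torch
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True)
with torch.no_grad():
    text_features = model.get_text_features(**inputs)
print(text_features.shape)  # torch.Size([2, 512]) -- the projection_dim
```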