Unverified Commit 27b3031d authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
@@ -310,8 +310,9 @@ class VisionTextDualEncoderModel(PreTrainedModel):
r"""
Returns:
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
@@ -336,8 +337,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
"""
```"""
return_dict = return_dict if return_dict is not None else self.config.return_dict
vision_outputs = self.vision_model(
@@ -30,15 +30,15 @@ class VisionTextDualEncoderProcessor:
Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single
processor.
[`VisionTextDualEncoderProcessor`] offers all the functionalities of
[`AutoFeatureExtractor`] and [`AutoTokenizer`]. See the
[`~VisionTextDualEncoderProcessor.__call__`] and
[`~VisionTextDualEncoderProcessor.decode`] for more information.
Args:
feature_extractor ([`AutoFeatureExtractor`]):
The feature extractor is a required input.
tokenizer ([`PreTrainedTokenizer`]):
The tokenizer is a required input.
"""
@@ -61,17 +61,19 @@ class VisionTextDualEncoderProcessor:
def save_pretrained(self, save_directory):
"""
Save a VisionTextDualEncoder feature extractor object and VisionTextDualEncoder tokenizer object to the
directory `save_directory`, so that it can be re-loaded using the
[`~VisionTextDualEncoderProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
@@ -82,32 +84,34 @@ class VisionTextDualEncoderProcessor:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`VisionTextDualEncoderProcessor`] from a pretrained VisionTextDualEncoder
processor.
<Tip>
This class method is simply calling AutoFeatureExtractor's
[`~PreTrainedFeatureExtractor.from_pretrained`] and AutoTokenizer's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
`./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`].
"""
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -117,38 +121,38 @@ class VisionTextDualEncoderProcessor:
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the
`text` and `kwargs` arguments to VisionTextDualEncoderTokenizer's
[`~PreTrainedTokenizer.__call__`] if `text` is not `None` to encode the text. To
prepare the image(s), this method forwards the `images` and `kwargs` arguments to
AutoFeatureExtractor's [`~AutoFeatureExtractor.__call__`] if `images` is not `None`.
Please refer to the docstring of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if
`text` is not `None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
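A sketch of preparing both modalities in one call, assuming the processor built in the earlier example; the image URL is the COCO sample used elsewhere in these docstrings:

```python
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Text goes through the tokenizer, images through the feature extractor,
# and both land in a single BatchEncoding.
inputs = processor(text=["a photo of two cats"], images=image, return_tensors="pt", padding=True)
print(sorted(inputs.keys()))  # e.g. ['attention_mask', 'input_ids', 'pixel_values', ...]
```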
@@ -171,7 +175,7 @@ class VisionTextDualEncoderProcessor:
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to VisionTextDualEncoderTokenizer's
[`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
@@ -179,7 +183,7 @@ class VisionTextDualEncoderProcessor:
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to VisionTextDualEncoderTokenizer's
[`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.decode(*args, **kwargs)
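Both forwarders behave exactly like the underlying tokenizer methods; a tiny sketch with made-up token ids:

```python
import torch

predicted_ids = torch.tensor([[101, 1037, 6302, 102]])  # hypothetical model output
print(processor.batch_decode(predicted_ids, skip_special_tokens=True))
print(processor.decode(predicted_ids[0], skip_special_tokens=True))
```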
@@ -36,60 +36,61 @@ VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class VisualBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`VisualBertModel`]. It is used
to instantiate a VisualBERT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the VisualBERT
[visualbert-vqa-coco-pre](https://huggingface.co/uclanlp/visualbert-vqa-coco-pre) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the VisualBERT model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`VisualBertModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
visual_embedding_dim (`int`, *optional*, defaults to 512):
Dimensionality of the visual embeddings to be passed to the model.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling
[`VisualBertModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bypass_transformer (`bool`, *optional*, defaults to `False`):
Whether or not the model should bypass the transformer for the visual embeddings. If set to `True`,
the model directly concatenates the visual embeddings from [`VisualBertEmbeddings`] with
the text output from the transformer, and then passes it to a self-attention layer.
special_visual_initialize (`bool`, *optional*, defaults to `True`):
Whether or not the visual token type and position type embedding weights should be initialized the same as
the textual token type and position type embeddings. When set to `True`, the weights of the textual
token type and position type embeddings are copied to the respective visual embedding layers.
Example:
```python
>>> from transformers import VisualBertModel, VisualBertConfig
>>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration
@@ -100,7 +101,7 @@ class VisualBertConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
"""
```"""
model_type = "visual_bert"
@@ -736,9 +736,10 @@ class VisualBertModel(VisualBertPreTrainedModel):
Returns:
Example:
```python
# Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image.
from transformers import BertTokenizer, VisualBertModel
import torch
@@ -759,7 +760,7 @@ class VisualBertModel(VisualBertPreTrainedModel):
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
"""
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -28,47 +28,47 @@ VIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ViTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ViTModel`]. It is used to
instantiate a ViT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the ViT [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
image_size (`int`, *optional*, defaults to `224`):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to `16`):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to `3`):
The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
Example:
```python
>>> from transformers import ViTModel, ViTConfig
>>> # Initializing a ViT vit-base-patch16-224 style configuration
@@ -79,7 +79,7 @@ class ViTConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
"""
```"""
model_type = "vit"
def __init__(
@@ -38,25 +38,25 @@ class ViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a ViT feature extractor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
`PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
Only has an effect if `do_resize` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
@@ -86,27 +86,29 @@ class ViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
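A sketch of the batched case with NumPy inputs (random placeholder data); as the warning above notes, arrays are converted back to PIL images for resizing:

```python
import numpy as np
from transformers import ViTFeatureExtractor

feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Two fake channel-first uint8 images; real code would pass PIL images directly.
images = [np.random.randint(0, 256, (3, 32, 32), dtype=np.uint8) for _ in range(2)]
batch = feature_extractor(images=images, return_tensors="np")
print(batch["pixel_values"].shape)  # (2, 3, 224, 224)
```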
@@ -515,8 +515,9 @@ class FlaxViTModel(FlaxViTPreTrainedModel):
FLAX_VISION_MODEL_DOCSTRING = """
Returns:
Examples:
```python
>>> from transformers import ViTFeatureExtractor, FlaxViTModel
>>> from PIL import Image
>>> import requests
@@ -530,6 +531,7 @@ FLAX_VISION_MODEL_DOCSTRING = """
>>> inputs = feature_extractor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
overwrite_call_docstring(FlaxViTModel, FLAX_VISION_MODEL_DOCSTRING)
@@ -594,8 +596,9 @@ class FlaxViTForImageClassification(FlaxViTPreTrainedModel):
FLAX_VISION_CLASSIF_DOCSTRING = """
Returns:
Example:
```python
>>> from transformers import ViTFeatureExtractor, FlaxViTForImageClassification
>>> from PIL import Image
>>> import jax
@@ -614,6 +617,7 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
>>> # model predicts one of the 1000 ImageNet classes
>>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
>>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
```
"""
overwrite_call_docstring(FlaxViTForImageClassification, FLAX_VISION_CLASSIF_DOCSTRING)
@@ -674,8 +674,9 @@ class TFViTModel(TFViTPreTrainedModel):
r"""
Returns:
Examples:
```python
>>> from transformers import ViTFeatureExtractor, TFViTModel
>>> from PIL import Image
>>> import requests
@@ -689,7 +690,7 @@ class TFViTModel(TFViTPreTrainedModel):
>>> inputs = feature_extractor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
```"""
inputs = input_processing(
func=self.call,
config=self.config,
@@ -517,8 +517,9 @@ class ViTModel(ViTPreTrainedModel):
r"""
Returns:
Examples:
```python
>>> from transformers import ViTFeatureExtractor, ViTModel
>>> from PIL import Image
>>> import requests
@@ -532,7 +533,7 @@ class ViTModel(ViTPreTrainedModel):
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -28,158 +28,154 @@ WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class Wav2Vec2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Wav2Vec2Model`]. It is used to
instantiate a Wav2Vec2 model according to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2
[facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32):
Vocabulary size of the Wav2Vec2 model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`Wav2Vec2Model`] or [`TFWav2Vec2Model`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for quantized feature extractor states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
Whether the 1D convolutional layers have a bias.
num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
mask_time_length (`int`, *optional*, defaults to 10):
Length of vector span along the time axis.
mask_time_min_masks (`int`, *optional*, defaults to 2):
The minimum number of masks of length `mask_time_length` generated along the time axis, each time
step, irrespectively of `mask_time_prob`. Only relevant if
`mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`.
mask_feature_prob (`float`, *optional*, defaults to 0.0):
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
mask_feature_length (`int`, *optional*, defaults to 10):
Length of vector span along the feature axis.
mask_feature_min_masks (`int`, *optional*, defaults to 0):
The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
step, irrespectively of `mask_feature_prob`. Only relevant if
`mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
num_codevectors_per_group (`int`, *optional*, defaults to 320):
Number of entries in each quantization codebook (group).
num_codevector_groups (`int`, *optional*, defaults to 2):
Number of codevector groups for product codevector quantization.
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the output of the feature extractor that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256):
Dimensionality of the quantized feature vectors.
proj_codevector_dim (`int`, *optional*, defaults to 256):
Dimensionality of the final projection of both the quantized and the transformer features.
diversity_loss_weight (`float`, *optional*, defaults to 0.1):
The weight of the codebook diversity loss component.
ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
instance of [`Wav2Vec2ForCTC`].
ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
instance of [`Wav2Vec2ForCTC`].
use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of [`Wav2Vec2ForSequenceClassification`].
classifier_proj_size (`int`, *optional*, defaults to 256):
Dimensionality of the projection before token mean-pooling for classification.
tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
*XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
*XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
xvector_output_dim (`int`, *optional*, defaults to 512):
Dimensionality of the *XVector* embedding vectors.
add_adapter (`bool`, *optional*, defaults to `False`):
Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
adapter_kernel_size (`int`, *optional*, defaults to 3):
Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
adapter_stride (`int`, *optional*, defaults to 2):
Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
num_adapter_layers (`int`, *optional*, defaults to 3):
Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is True`.
output_hidden_size (`int`, *optional*):
Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
if `add_adapter is True`.
Example:
```python
>>> from transformers import Wav2Vec2Model, Wav2Vec2Config
>>> # Initializing a Wav2Vec2 facebook/wav2vec2-base-960h style configuration
@@ -190,7 +186,7 @@ class Wav2Vec2Config(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
"""
```"""
model_type = "wav2vec2"
def __init__(
@@ -34,34 +34,32 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
Constructs a Wav2Vec2 feature extractor.
This feature extractor inherits from
[`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
feature_size (`int`, defaults to 1):
The feature dimension of the extracted features.
sampling_rate (`int`, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
padding_value (`float`, defaults to 0.0):
The value that is used to fill the padding values.
do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
improve the performance for some models, *e.g.*, [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.
<Tip>
Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
`attention_mask`. For such models, `input_values` should simply be padded with 0 and no
`attention_mask` should be passed.
For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
passed for batched inference.
"""
</Tip>"""
model_input_names = ["input_values", "attention_mask"]
@@ -116,55 +114,55 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
Main method to featurize and prepare for the model one or several sequence(s).
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`):
Activates truncation to cut input sequences longer than *max_length* to *max_length*.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
[What are attention masks?](../glossary#attention-mask)
<Tip>
Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
`attention_mask`. For such models, `input_values` should simply be padded with 0 and no
`attention_mask` should be passed.
For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
passed for batched inference.
</Tip>
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0):
The value that is used to fill the padding values.
"""
if sampling_rate is not None:
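A sketch of batched inference with padding, using a "layer"-norm checkpoint for which, per the note above, an attention mask should be passed (the audio is random placeholder data):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

# Two clips of different lengths; the shorter one is padded with 0.0.
batch = [np.random.randn(16000).astype(np.float32), np.random.randn(8000).astype(np.float32)]
inputs = feature_extractor(batch, sampling_rate=16000, padding=True, return_tensors="pt")
print(inputs.input_values.shape)    # torch.Size([2, 16000])
print(inputs.attention_mask.shape)  # torch.Size([2, 16000])
```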
@@ -937,8 +937,9 @@ class FlaxWav2Vec2Model(FlaxWav2Vec2PreTrainedModel):
FLAX_WAV2VEC2_MODEL_DOCSTRING = """
Returns:
Example:
```python
>>> from transformers import Wav2Vec2Processor, FlaxWav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
@@ -956,6 +957,7 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
>>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values # Batch size 1
>>> hidden_states = model(input_values).last_hidden_state
```
"""
overwrite_call_docstring(
@@ -1037,8 +1039,9 @@ class FlaxWav2Vec2ForCTC(FlaxWav2Vec2PreTrainedModel):
FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
Returns:
Example:
```python
>>> import jax.numpy as jnp
>>> from transformers import Wav2Vec2Processor, FlaxWav2Vec2ForCTC
>>> from datasets import load_dataset
@@ -1061,6 +1064,7 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
>>> transcription = processor.decode(predicted_ids[0])
>>> # should give: "A MAN SAID TO THE UNIVERSE SIR I EXIST"
```
"""
overwrite_call_docstring(
@@ -1108,10 +1112,11 @@ class FlaxWav2Vec2ForPreTrainingModule(nn.Module):
r"""
Returns:
Example:
```python
"""
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1220,8 +1225,9 @@ class FlaxWav2Vec2ForPreTraining(FlaxWav2Vec2PreTrainedModel):
FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
Returns:
Example:
```python
>>> import optax
>>> import numpy as np
>>> import jax.numpy as jnp
@@ -1259,6 +1265,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
>>> # show that cosine similarity is much higher than random
>>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
```
"""
overwrite_call_docstring(
@@ -1396,8 +1396,9 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
Returns:
Example:
```python
>>> from transformers import Wav2Vec2Processor, TFWav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
@@ -1415,7 +1416,7 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
>>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
>>> hidden_states = model(input_values).last_hidden_state
"""
```"""
inputs = input_values_processing(
func=self.call,
@@ -29,16 +29,16 @@ class Wav2Vec2Processor:
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
processor.
[`Wav2Vec2Processor`] offers all the functionalities of
[`Wav2Vec2FeatureExtractor`] and [`PreTrainedTokenizer`]. See the docstring
of [`~Wav2Vec2Processor.__call__`] and [`~Wav2Vec2Processor.decode`] for more
information.
Args:
feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
tokenizer (:class:`~transformers.PreTrainedTokenizer`):
An instance of :class:`~transformers.PreTrainedTokenizer`. The tokenizer is a required input.
feature_extractor (`Wav2Vec2FeatureExtractor`):
An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
tokenizer ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
"""
def __init__(self, feature_extractor, tokenizer):
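A hedged sketch of how the two required inputs described above are composed by hand (checkpoint name assumed):

```python
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

# wrap both components into the single processor described above
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
```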
......@@ -57,18 +57,20 @@ class Wav2Vec2Processor:
def save_pretrained(self, save_directory):
"""
Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory ``save_directory``, so
that it can be re-loaded using the :func:`~transformers.Wav2Vec2Processor.from_pretrained` class method.
Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so
that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method.
.. note::
<Tip>
This class method is simply calling
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
[`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`):
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
......@@ -79,30 +81,32 @@ class Wav2Vec2Processor:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.
Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor.
.. note::
<Tip>
This class method is simply calling Wav2Vec2FeatureExtractor's
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
PreTrainedTokenizer's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
PreTrainedTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
Please refer to the docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a feature extractor file saved using the
:meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
``./my_model_directory/``.
- a path or url to a saved feature extractor JSON `file`, e.g.,
``./my_model_directory/preprocessor_config.json``.
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
`./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
:class:`~transformers.PreTrainedTokenizer`
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
......@@ -128,9 +132,9 @@ class Wav2Vec2Processor:
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
PreTrainedTokenizer's :meth:`~transformers.PreTrainedTokenizer.__call__`. Please refer to the docstring of the
[`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
[`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the
above two methods for more information.
"""
return self.current_processor(*args, **kwargs)
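A hedged sketch of the mode switch described above: the same object extracts features in normal mode and tokenizes labels inside `as_target_processor` (the dummy audio and transcript are assumptions):

```python
import numpy as np
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# normal mode: forwards to the feature extractor
inputs = processor(np.zeros(16_000, dtype=np.float32), sampling_rate=16_000, return_tensors="np")

# target mode: forwards to the tokenizer
with processor.as_target_processor():
    labels = processor("A DUMMY TRANSCRIPT").input_ids
```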
......@@ -138,9 +142,9 @@ class Wav2Vec2Processor:
def pad(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.pad` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
PreTrainedTokenizer's :meth:`~transformers.PreTrainedTokenizer.pad`. Please refer to the docstring of the above
[`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
[`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to
PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above
two methods for more information.
"""
return self.current_processor.pad(*args, **kwargs)
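Similarly, a sketch of `pad` on pre-computed features (the shapes are assumptions):

```python
import numpy as np
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
features = [
    {"input_values": np.zeros(16_000, dtype=np.float32)},
    {"input_values": np.zeros(12_000, dtype=np.float32)},
]
batch = processor.pad(features, padding="longest", return_tensors="np")
print(batch.input_values.shape)  # (2, 16000)
```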
......@@ -148,7 +152,7 @@ class Wav2Vec2Processor:
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's
:meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
[`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
......@@ -156,7 +160,7 @@ class Wav2Vec2Processor:
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's
:meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
[`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.decode(*args, **kwargs)
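A hedged end-to-end sketch tying `__call__`, greedy CTC prediction and `batch_decode` together (silent dummy audio, so the transcription is not meaningful):

```python
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

inputs = processor(np.zeros(16_000, dtype=np.float32), sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
print(processor.batch_decode(predicted_ids))  # list with one transcription string
```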
......
......@@ -50,31 +50,31 @@ PRETRAINED_VOCAB_FILES_MAP = {
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-base-960h": sys.maxsize}
WAV2VEC2_KWARGS_DOCSTRING = r"""
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters.
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
If left unset or set to `None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
pad_to_multiple_of (:obj:`int`, `optional`):
pad_to_multiple_of (`int`, *optional*):
If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
"""
......@@ -84,28 +84,28 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2CTC tokenizer.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
Users should refer to the superclass for more information regarding such methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
File containing the vocabulary.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
word_delimiter_token (`str`, *optional*, defaults to `"|"`):
The token used for defining the end of a word.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to accept lowercase input and lowercase the output when decoding.
**kwargs
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -153,7 +153,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
@property
def word_delimiter_token(self) -> str:
"""
:obj:`str`: Word delimiter token. Log an error if used while not having been set.
`str`: Word delimiter token. Log an error if used while not having been set.
"""
if self._word_delimiter_token is None and self.verbose:
logger.error("Using word_delimiter_token, but it is not set yet.")
......@@ -163,7 +163,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
@property
def word_delimiter_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has
not been set.
"""
if self._word_delimiter_token is None:
......@@ -285,17 +285,18 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
it with indices starting from the length of the current vocabulary.
Args:
new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`):
new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
checking if the tokenizer assign the index of the ``unk_token`` to them).
special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
checking if the tokenizer assigns the index of the `unk_token` to them).
special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the tokens should be added as special tokens.
Returns:
:obj:`int`: The number of tokens actually added to the vocabulary.
`int`: The number of tokens actually added to the vocabulary.
Examples::
Examples:
```python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
......@@ -304,7 +305,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
print('We have added', num_added_toks, 'tokens')
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
"""
```"""
new_tokens = [str(tok) for tok in new_tokens]
tokens_to_add = []
......@@ -341,45 +342,44 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2 tokenizer.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
Users should refer to the superclass for more information regarding such methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
File containing the vocabulary.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
word_delimiter_token (`str`, *optional*, defaults to `"|"`):
The token used for defining the end of a word.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the output when decoding.
do_normalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
improve the performance for some models, *e.g.*, `wav2vec2-lv60
<https://huggingface.co/models?search=lv60>`__.
return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not :meth:`~transformers.Wav2Vec2Tokenizer.__call__` should return :obj:`attention_mask`.
improve the performance for some models, *e.g.*, [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether or not [`~Wav2Vec2Tokenizer.__call__`] should return `attention_mask`.
.. note::
<Tip>
Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
:obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
:obj:`attention_mask` should be passed.
Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
`attention_mask`. For such models, `input_values` should simply be padded with 0 and no
`attention_mask` should be passed.
For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
<https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
passed for batched inference.
</Tip>
**kwargs
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
vocab_files_names = VOCAB_FILES_NAMES
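To make the tip above concrete, a hedged sketch using the feature extractor (which supersedes this tokenizer) with a layer-norm checkpoint, where the attention mask should be passed:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

# lv60 checkpoints use config.feat_extract_norm == "layer" -> pass attention_mask
extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
batch = [np.zeros(16_000, dtype=np.float32), np.zeros(9_000, dtype=np.float32)]
inputs = extractor(batch, sampling_rate=16_000, padding=True, return_attention_mask=True, return_tensors="np")
print(inputs.attention_mask.shape)  # (2, 16000): zeros mark the padded tail of the shorter clip
```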
......@@ -437,7 +437,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
@property
def word_delimiter_token(self) -> str:
"""
:obj:`str`: Padding token. Log an error if used while not having been set.
`str`: Padding token. Log an error if used while not having been set.
"""
if self._word_delimiter_token is None and self.verbose:
logger.error("Using word_delimiter_token, but it is not set yet.")
......@@ -447,7 +447,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
@property
def word_delimiter_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has
not been set.
"""
if self._word_delimiter_token is None:
......@@ -478,7 +478,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
sequences.
Args:
raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of lists of float values.
"""
......
......@@ -52,32 +52,32 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2PhonemeCTC tokenizer.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
Users should refer to the superclass for more information regarding such methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
File containing the vocabulary.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sentence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sentence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
do_phonemize (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_phonemize (`bool`, *optional*, defaults to `True`):
Whether the tokenizer should phonetize the input or not. Only if a sequence of phonemes is passed to the
tokenizer, :obj:`do_phonemize` should be set to ``False``.
phonemizer_lang (:obj:`str`, `optional`, defaults to :obj:`"en-us"`):
tokenizer should `do_phonemize` be set to `False`.
phonemizer_lang (`str`, *optional*, defaults to `"en-us"`):
The language of the phoneme set to which the tokenizer should phonetize the input text.
phonemizer_backend (:obj:`str`, `optional`. defaults to :obj:`"espeak"`):
The backend phonetization library that shall be used by the phonemizer library. Defaults to ``espeak-ng``.
See the `phonemizer package <https://github.com/bootphon/phonemizer#readme>`_. for more information.
phonemizer_backend (`str`, *optional*, defaults to `"espeak"`):
The backend phonetization library that shall be used by the phonemizer library. Defaults to `espeak-ng`.
See the [phonemizer package](https://github.com/bootphon/phonemizer#readme) for more information.
**kwargs
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
Additional keyword arguments passed along to [`PreTrainedTokenizer`]
"""
vocab_files_names = VOCAB_FILES_NAMES
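A hedged usage sketch (requires the third-party `phonemizer` package with an espeak backend installed; passing `phonemizer_lang` through `__call__` relies on it being forwarded to `_prepare_for_tokenization`, and the printed phonemes are illustrative):

```python
from transformers import Wav2Vec2PhonemeCTCTokenizer

tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
ids = tokenizer("Hello how are you", phonemizer_lang="en-us").input_ids
print(tokenizer.decode(ids))  # a phoneme string such as "h ə l oʊ h aʊ ɑːɹ j uː"
```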
......@@ -139,25 +139,25 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
"""
Performs any necessary transformations before tokenization.
This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
:obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
`kwargs` at the end of the encoding process to be sure all the arguments have been used.
Args:
text (:obj:`str`):
text (`str`):
The text to prepare.
is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
is_split_into_words (`bool`, *optional*, defaults to `False`):
Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
which it will tokenize. This is useful for NER or token classification.
phonemizer_lang (:obj:`str`, `optional`):
phonemizer_lang (`str`, *optional*):
The language of the phoneme set to which the tokenizer should phonetize the input text.
do_phonemize (:obj:`bool`, `optional`):
do_phonemize (`bool`, *optional*):
Whether the tokenizer should phonetize the input text or not. Only if a sequence of phonemes is passed
to the tokenizer, :obj:`do_phonemize` should be set to ``False``.
to the tokenizer should `do_phonemize` be set to `False`.
Returns:
:obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
"""
if is_split_into_words:
text = " " + text
......@@ -217,7 +217,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
@property
def word_delimiter_token(self) -> str:
"""
:obj:`str`: Word delimiter token. Log an error if used while not having been set.
`str`: Word delimiter token. Log an error if used while not having been set.
"""
if self._word_delimiter_token is None and self.verbose:
return None
......@@ -226,7 +226,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
@property
def word_delimiter_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has
not been set.
"""
if self._word_delimiter_token is None:
......@@ -244,7 +244,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
@property
def phone_delimiter_token(self) -> str:
"""
:obj:`str`: Word delimiter token. Log an error if used while not having been set.
`str`: Word delimiter token. Log an error if used while not having been set.
"""
if self._phone_delimiter_token is None and self.verbose:
logger.error("Using phone_delimiter_token, but it is not set yet.")
......@@ -254,7 +254,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
@property
def phone_delimiter_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the phone_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
`Optional[int]`: Id of the phone_delimiter_token in the vocabulary. Returns `None` if the token has
not been set.
"""
if self._phone_delimiter_token is None:
......@@ -357,17 +357,18 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
it with indices starting from the length of the current vocabulary.
Args:
new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`):
new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
checking if the tokenizer assign the index of the ``unk_token`` to them).
special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
checking if the tokenizer assigns the index of the `unk_token` to them).
special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the tokens should be added as special tokens.
Returns:
:obj:`int`: The number of tokens actually added to the vocabulary.
`int`: The number of tokens actually added to the vocabulary.
Examples::
Examples:
```python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
model = Wav2Vec2PhonemeForCTC.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
......@@ -376,7 +377,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
print('We have added', num_added_toks, 'tokens')
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
"""
```"""
new_tokens = [str(tok) for tok in new_tokens]
tokens_to_add = []
......
......@@ -37,10 +37,10 @@ if TYPE_CHECKING:
@dataclass
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
"""
Output type of :class:`~transformers.Wav2Vec2DecoderWithLM`, with transcription.
Output type of [`Wav2Vec2DecoderWithLM`], with transcription.
Args:
text (list of :obj:`str`):
text (list of `str`):
Decoded logits in text form. Usually the speech transcription.
"""
......@@ -53,12 +53,12 @@ class Wav2Vec2ProcessorWithLM:
with language model support into a single processor for language model boosted speech recognition decoding.
Args:
feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
tokenizer (:class:`~transformers.Wav2Vec2CTCTokenizer`):
An instance of :class:`~transformers.Wav2Vec2CTCTokenizer`. The tokenizer is a required input.
decoder (:obj:`pyctcdecode.BeamSearchDecoderCTC`):
An instance of :class:`pyctcdecode.BeamSearchDecoderCTC`. The decoder is a required input.
feature_extractor ([`Wav2Vec2FeatureExtractor`]):
An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
tokenizer ([`Wav2Vec2CTCTokenizer`]):
An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input.
decoder (`pyctcdecode.BeamSearchDecoderCTC`):
An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
"""
def __init__(
......@@ -98,20 +98,22 @@ class Wav2Vec2ProcessorWithLM:
def save_pretrained(self, save_directory):
"""
Save the Wav2Vec2 feature_extractor, a tokenizer object and a pyctcdecode decoder to the directory
``save_directory``, so that they can be re-loaded using the
:func:`~transformers.Wav2Vec2ProcessorWithLM.from_pretrained` class method.
`save_directory`, so that they can be re-loaded using the
[`~Wav2Vec2ProcessorWithLM.from_pretrained`] class method.
.. note::
<Tip>
This class method is simply calling
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained,`
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained` and pyctcdecode's
:meth:`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`.
[`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`],
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`] and pyctcdecode's
[`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`].
Please refer to the docstrings of the methods above for more information.
</Tip>
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`):
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
......@@ -122,32 +124,34 @@ class Wav2Vec2ProcessorWithLM:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a :class:`~transformers.Wav2Vec2ProcessorWithLM` from a pretrained Wav2Vec2 processor.
Instantiate a [`Wav2Vec2ProcessorWithLM`] from a pretrained Wav2Vec2 processor.
.. note::
<Tip>
This class method is simply calling Wav2Vec2FeatureExtractor's
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained`,
Wav2Vec2CTCTokenizer's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`,
and :meth:`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`.
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`],
Wav2Vec2CTCTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`],
and [`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`].
Please refer to the docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a feature extractor file saved using the
:meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
``./my_model_directory/``.
- a path or url to a saved feature extractor JSON `file`, e.g.,
``./my_model_directory/preprocessor_config.json``.
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
`./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
:class:`~transformers.PreTrainedTokenizer`
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
requires_backends(cls, "pyctcdecode")
from pyctcdecode import BeamSearchDecoderCTC
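A hedged round-trip sketch for the LM-boosted processor (the hub repo name is an assumption; any repo bundling a pyctcdecode language model works, and `pyctcdecode` plus `kenlm` must be installed):

```python
from transformers import Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
processor.save_pretrained("./wav2vec2_with_lm")  # feature extractor + tokenizer + decoder files
reloaded = Wav2Vec2ProcessorWithLM.from_pretrained("./wav2vec2_with_lm")
```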
......@@ -215,9 +219,9 @@ class Wav2Vec2ProcessorWithLM:
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2ProcessorWithLM.as_target_processor` this method forwards all its arguments to
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
[`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
[`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of
the above two methods for more information.
"""
return self.current_processor(*args, **kwargs)
......@@ -225,9 +229,9 @@ class Wav2Vec2ProcessorWithLM:
def pad(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.pad` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2ProcessorWithLM.as_target_processor` this method forwards all its arguments to
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.pad`. Please refer to the docstring of the
[`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
[`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the
above two methods for more information.
"""
return self.current_processor.pad(*args, **kwargs)
......@@ -245,30 +249,32 @@ class Wav2Vec2ProcessorWithLM:
"""
Batch decode output logits to audio transcription with language model support.
.. note::
<Tip>
This function makes use of Python's multiprocessing.
</Tip>
Args:
logits (:obj:`np.ndarray`):
logits (`np.ndarray`):
The logits output vector of the model representing the log probabilities for each token.
num_processes (:obj:`int`, `optional`):
num_processes (`int`, *optional*):
Number of processes on which the function should be parallelized over. Defaults to the number of
available CPUs.
beam_width (:obj:`int`, `optional`):
beam_width (`int`, *optional*):
Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH.
beam_prune_logp (:obj:`int`, `optional`):
beam_prune_logp (`int`, *optional*):
Beams that are much worse than the best beam will be pruned. Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP.
token_min_logp (:obj:`int`, `optional`):
token_min_logp (`int`, *optional*):
Tokens below this logp are skipped unless they are the argmax of the frame. Defaults to pyctcdecode's
DEFAULT_MIN_TOKEN_LOGP.
hotwords (:obj:`List[str]`, `optional`):
hotwords (`List[str]`, *optional*):
List of words with extra importance; they can be OOV for the LM.
hotword_weight (:obj:`int`, `optional`):
hotword_weight (`int`, *optional*):
Weight factor for hotword importance. Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT.
Returns:
:class:`~transformers.models.wav2vec2.Wav2Vec2DecoderWithLMOutput` or :obj:`tuple`.
[`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`] or `tuple`.
"""
from pyctcdecode.constants import (
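A hedged sketch of `batch_decode` on raw logits; random logits stand in for real model output, so the decoded text is meaningless, but the call mirrors the arguments documented above:

```python
import numpy as np
from transformers import Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

vocab_size = len(processor.tokenizer)
batch_logits = np.random.randn(2, 200, vocab_size).astype(np.float32)  # (batch, frames, vocab)

output = processor.batch_decode(batch_logits, beam_width=64, hotwords=["huggingface"], hotword_weight=10.0)
print(output.text)  # list of two decoded strings
```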
......@@ -318,23 +324,23 @@ class Wav2Vec2ProcessorWithLM:
Decode output logits to audio transcription with language model support.
Args:
logits (:obj:`np.ndarray`):
logits (`np.ndarray`):
The logits output vector of the model representing the log probabilities for each token.
beam_width (:obj:`int`, `optional`):
beam_width (`int`, *optional*):
Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH.
beam_prune_logp (:obj:`int`, `optional`):
beam_prune_logp (`int`, *optional*):
A threshold to prune beams with log-probs less than best_beam_logp + beam_prune_logp. The value should
be <= 0. Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP.
token_min_logp (:obj:`int`, `optional`):
token_min_logp (`int`, *optional*):
Tokens with log-probs below token_min_logp are skipped unless they have the maximum log-prob for an
utterance. Defaults to pyctcdecode's DEFAULT_MIN_TOKEN_LOGP.
hotwords (:obj:`List[str]`, `optional`):
hotwords (`List[str]`, *optional*):
List of words with extra importance which can be missing from the LM's vocabulary, e.g. ["huggingface"]
hotword_weight (:obj:`int`, `optional`):
hotword_weight (`int`, *optional*):
Weight multiplier that boosts hotword scores. Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT.
Returns:
:class:`~transformers.models.wav2vec2.Wav2Vec2DecoderWithLMOutput` or :obj:`tuple`.
[`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`] or `tuple`.
"""
from pyctcdecode.constants import (
......
......@@ -28,151 +28,151 @@ WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class WavLMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.WavLMModel`. It is used to
This is the configuration class to store the configuration of a [`WavLMModel`]. It is used to
instantiate a WavLM model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the WavLM `facebook/wavlm-base-960h
<https://huggingface.co/facebook/wavlm-base-960h>`__ architecture.
configuration with the defaults will yield a similar configuration to that of the WavLM [facebook/wavlm-base-960h](https://huggingface.co/facebook/wavlm-base-960h) architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 32):
vocab_size (`int`, *optional*, defaults to 32):
Vocabulary size of the WavLM model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.WavLMModel`. Vocabulary size of the model.
Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
:class:`~transformers.WavLMModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768):
`inputs_ids` passed when calling [`WavLMModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
final_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for the final projection layer of :class:`WavLMForCTC`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`WavLMForCTC`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for quantized feature extractor states.
conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
`conv_dim`.
conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
Whether the 1D convolutional layers have a bias.
num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.
num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
False`` corresponds to applying layer norm after the attention layer.
apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10):
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be
masked along the time axis. This is only relevant if `apply_spec_augment is True`.
mask_time_length (`int`, *optional*, defaults to 10):
Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
mask_time_min_masks (`int`, *optional*, defaults to 2):
The minimum number of masks of length `mask_time_length` generated along the time axis, each time
step, irrespective of `mask_time_prob`. Only relevant if
`mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks`
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
mask_feature_prob (`float`, *optional*, defaults to 0.0):
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
be masked. Approximately `mask_feature_prob * hidden_size // mask_feature_length` feature vectors will be
masked along the feature axis. This is only relevant if `apply_spec_augment is True`.
mask_feature_length (`int`, *optional*, defaults to 10):
Length of vector span along the feature axis.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
num_codevectors_per_group (`int`, *optional*, defaults to 320):
Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
num_codevector_groups (`int`, *optional*, defaults to 2):
Number of codevector groups for product codevector quantization.
contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
The temperature `kappa` in the contrastive loss.
feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the output of the feature extractor that's used by the quantizer.
num_negatives (:obj:`int`, `optional`, defaults to 100):
num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (:obj:`int`, `optional`, defaults to 256):
codevector_dim (`int`, *optional*, defaults to 256):
Dimensionality of the quantized feature vectors.
proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
proj_codevector_dim (`int`, *optional*, defaults to 256):
Dimensionality of the final projection of both the quantized and the transformer features.
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
diversity_loss_weight (`float`, *optional*, defaults to 0.1):
The weight of the codebook diversity loss component.
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.WavLMForCTC`.
ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
instance of [`WavLMForCTC`].
ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
instance of :class:`~transformers.WavLMForCTC`.
use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
instance of [`WavLMForCTC`].
use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of :class:`~transformers.WavLMForSequenceClassification`.
classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
instance of [`WavLMForSequenceClassification`].
classifier_proj_size (`int`, *optional*, defaults to 256):
Dimensionality of the projection before token mean-pooling for classification.
tdnn_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 1500)`):
A tuple of integers defining the number of output channels of each 1D convolutional layer in the `TDNN`
module of the `XVector` model. The length of `tdnn_dim` defines the number of `TDNN` layers.
tdnn_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 3, 3, 1, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the `TDNN` module of the
`XVector` model. The length of `tdnn_kernel` has to match the length of `tdnn_dim`.
tdnn_dilation (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(1, 2, 3, 1, 1)`):
A tuple of integers defining the dilation factor of each 1D convolutional layer in `TDNN` module of the
`XVector` model. The length of `tdnn_dilation` has to match the length of `tdnn_dim`.
xvector_output_dim (:obj:`int`, `optional`, defaults to 512):
Dimensionality of the `XVector` embedding vectors.
add_adapter (:obj:`bool`, `optional`, defaults to :obj:`False`):
tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
*XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
*XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
xvector_output_dim (`int`, *optional*, defaults to 512):
Dimensionality of the *XVector* embedding vectors.
add_adapter (`bool`, *optional*, defaults to `False`):
Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
adapter_kernel_size (:obj:`int`, `optional`, defaults to 3):
Kernel size of the convolutional layers in the adapter network. Only relevant if ``add_adapter is True``.
adapter_stride (:obj:`int`, `optional`, defaults to 2):
Stride of the convolutional layers in the adapter network. Only relevant if ``add_adapter is True``.
num_adapter_layers (:obj:`int`, `optional`, defaults to 3):
Number of convolutional layers that should be used in the adapter network. Only relevant if ``add_adapter
is True``.
output_hidden_size (:obj:`int`, `optional`):
Dimensionality of the encoder output layer. If not defined, this defaults to `hidden-size`. Only relevant
if ``add_adapter is True``.
adapter_kernel_size (`int`, *optional*, defaults to 3):
Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
adapter_stride (`int`, *optional*, defaults to 2):
Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
num_adapter_layers (`int`, *optional*, defaults to 3):
Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is True`.
output_hidden_size (`int`, *optional*):
Dimensionality of the encoder output layer. If not defined, this defaults to `hidden_size`. Only relevant
if `add_adapter is True`.
Example::
Example:
```python
>>> from transformers import WavLMModel, WavLMConfig
>>> # Initializing a WavLM facebook/wavlm-base-960h style configuration
......@@ -183,7 +183,7 @@ class WavLMConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
"""
```"""
model_type = "wavlm"
def __init__(
......
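Beyond the default-construction example above, a hedged sketch overriding a few of the documented arguments:

```python
from transformers import WavLMConfig, WavLMModel

# stable layer norm plus a convolutional adapter, as documented above
config = WavLMConfig(do_stable_layer_norm=True, add_adapter=True, adapter_stride=2, num_adapter_layers=3)
model = WavLMModel(config)
print(config.output_hidden_size)  # assumed to fall back to hidden_size when not set explicitly
```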
......@@ -36,103 +36,104 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel` or a
:class:`~transformers.TFXLMModel`. It is used to instantiate a XLM model according to the specified arguments,
This is the configuration class to store the configuration of an [`XLMModel`] or a
[`TFXLMModel`]. It is used to instantiate an XLM model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
to that of the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
to that of the [xlm-mlm-en-2048](https://huggingface.co/xlm-mlm-en-2048) architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 30145):
vocab_size (`int`, *optional*, defaults to 30145):
Vocabulary size of the XLM model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.XLMModel` or :class:`~transformers.TFXLMModel`.
emb_dim (:obj:`int`, `optional`, defaults to 2048):
`inputs_ids` passed when calling [`XLMModel`] or [`TFXLMModel`].
emb_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, `optional`, defaults to 12):
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, `optional`, defaults to 16):
n_head (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, `optional`, defaults to 0.1):
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention mechanism.
gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use `gelu` for the activations instead of `relu`.
sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
gelu_activation (`bool`, *optional*, defaults to `True`):
Whether or not to use *gelu* for the activations instead of *relu*.
sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
causal (`bool`, *optional*, defaults to `False`):
Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
order to only attend to the left-side context instead of a bidirectional context.
asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
asm (`bool`, *optional*, defaults to `False`):
Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, `optional`, defaults to 1):
n_langs (`int`, *optional*, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`bool`, `optional`, defaults to :obj:`True`)
Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual
models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__ for
use_lang_emb (`bool`, *optional*, defaults to `True`):
Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for
information on how to use them.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, `optional`, defaults to 2048^-0.5):
embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
init_std (:obj:`int`, `optional`, defaults to 50257):
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
embedding matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, `optional`, defaults to 0):
bos_index (`int`, *optional*, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, `optional`, defaults to 1):
eos_index (`int`, *optional*, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, `optional`, defaults to 2):
pad_index (`int`, *optional*, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, `optional`, defaults to 3):
unk_index (`int`, *optional*, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, `optional`, defaults to 5):
mask_index (`int`, *optional*, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`bool`, `optional`, defaults to :obj:`True`):
is_encoder (`bool`, *optional*, defaults to `True`):
Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, `optional`, defaults to "first"):
summary_type (`str`, *optional*, defaults to `"first"`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Has to be one of the following options:
- :obj:`"last"`: Take the last token hidden state (like XLNet).
- :obj:`"first"`: Take the first token hidden state (like BERT).
- :obj:`"mean"`: Take the mean of all tokens hidden states.
- :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- :obj:`"attn"`: Not implemented now, use multi-head attention.
summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
- `"last"`: Take the last token hidden state (like XLNet).
- `"first"`: Take the first token hidden state (like BERT).
- `"mean"`: Take the mean of all tokens hidden states.
- `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- `"attn"`: Not implemented now, use multi-head attention.
summary_use_proj (`bool`, *optional*, defaults to `True`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Whether or not to add a projection after the vector extraction.
summary_activation (:obj:`str`, `optional`):
summary_activation (`str`, *optional*):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
Used in the sequence classification and multiple choice models.
Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
summary_first_dropout (`float`, *optional*, defaults to 0.1):
Used in the sequence classification and multiple choice models.
The dropout ratio to be used after the projection and activation.
start_n_top (:obj:`int`, `optional`, defaults to 5):
start_n_top (`int`, *optional*, defaults to 5):
Used in the SQuAD evaluation script.
end_n_top (:obj:`int`, `optional`, defaults to 5):
end_n_top (`int`, *optional*, defaults to 5):
Used in the SQuAD evaluation script.
mask_token_id (:obj:`int`, `optional`, defaults to 0):
mask_token_id (`int`, *optional*, defaults to 0):
Model-agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, `optional`, defaults to 1):
lang_id (`int`, *optional*, defaults to 1):
The ID of the language used by the model. This parameter is used when generating text in a given language.
Examples::
Examples:
```python
>>> from transformers import XLMConfig, XLMModel
>>> # Initializing a XLM configuration
......@@ -143,7 +144,7 @@ class XLMConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
"""
```"""
model_type = "xlm"
attribute_map = {
......
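As with the WavLM example, the hunk elides the middle of the snippet; a minimal sketch of the full pattern, assuming default `XLMConfig()` values and a randomly initialized model:

```python
>>> from transformers import XLMConfig, XLMModel

>>> # Initializing an XLM configuration with the defaults documented above
>>> configuration = XLMConfig()

>>> # Initializing a model with random weights from that configuration
>>> model = XLMModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```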
......@@ -534,49 +534,52 @@ class XLMTokenizer(PreTrainedTokenizer):
- Moses preprocessing and tokenization for most supported languages.
- Language-specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP).
- Optionally lowercases and normalizes all inputs text.
- The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
- The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols
(like "__classify__") to a vocabulary.
- The :obj:`lang2id` attribute maps the languages supported by the model with their IDs if provided (automatically
- The `lang2id` attribute maps the languages supported by the model to their IDs if provided (automatically
set for pretrained vocabularies).
- The :obj:`id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies).
- The `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
Vocabulary file.
merges_file (:obj:`str`):
merges_file (`str`):
Merges file.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
sequence. The token used is the `cls_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
cls_token (`str`, *optional*, defaults to `"</s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<special1>"`):
mask_token (`str`, *optional*, defaults to `"<special1>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
additional_special_tokens (`List[str]`, *optional*, defaults to `["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
List of additional special tokens.
lang2id (:obj:`Dict[str, int]`, `optional`):
lang2id (`Dict[str, int]`, *optional*):
Dictionary mapping languages string identifiers to their IDs.
id2lang (:obj:`Dict[int, str]`, `optional`):
id2lang (`Dict[int, str]`, *optional*):
Dictionary mapping language IDs to their string identifiers.
do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_lowercase_and_remove_accent (`bool`, *optional*, defaults to `True`):
Whether to lowercase and remove accents when tokenizing.
"""
......@@ -866,17 +869,17 @@ class XLMTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An XLM sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
bos = [self.bos_token_id]
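The layout can be summarized in a standalone sketch; the `bos_id`/`sep_id` placeholder values below are illustrative assumptions, not the pretrained vocabulary IDs:

```python
>>> def xlm_special_token_layout(token_ids_0, token_ids_1=None, bos_id=0, sep_id=1):
...     # single sequence: <s> X </s>; pair of sequences: <s> A </s> B </s>
...     if token_ids_1 is None:
...         return [bos_id] + token_ids_0 + [sep_id]
...     return [bos_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]

>>> xlm_special_token_layout([10, 11], [20, 21])
[0, 10, 11, 1, 20, 21, 1]
```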
......@@ -891,18 +894,18 @@ class XLMTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
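For inputs without pre-existing special tokens, the returned mask simply flags the positions that the `<s> A </s> B </s>` layout reserves; a minimal sketch of that logic, not the library implementation:

```python
>>> def xlm_special_tokens_mask(token_ids_0, token_ids_1=None):
...     # 1 marks a special token (<s> or </s>), 0 marks a sequence token
...     if token_ids_1 is None:
...         return [1] + [0] * len(token_ids_0) + [1]
...     return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]

>>> xlm_special_tokens_mask([10, 11], [20, 21])
[1, 0, 0, 1, 0, 0, 1]
```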
......@@ -921,21 +924,21 @@ class XLMTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
pair mask has the following format:
::
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
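The 0/1 pattern pictured above maps onto the `<s> A </s> B </s>` layout as follows; a sketch mirroring the description, not the exact library code:

```python
>>> def xlm_token_type_ids(token_ids_0, token_ids_1=None):
...     # 0s cover <s> A </s>; 1s cover B </s> when a second sequence is given
...     first_segment = [0] * (len(token_ids_0) + 2)  # <s> ... </s>
...     if token_ids_1 is None:
...         return first_segment
...     return first_segment + [1] * (len(token_ids_1) + 1)  # ... </s>

>>> xlm_token_type_ids([10, 11], [20, 21])
[0, 0, 0, 0, 1, 1, 1]
```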
......