Unverified commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
......@@ -940,18 +940,20 @@ class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel):
FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
Returns:
Example:

```python
>>> from transformers import CLIPTokenizer, FlaxCLIPTextModel

>>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooler_output = outputs.pooler_output # pooled (EOS token) states
```
"""
overwrite_call_docstring(FlaxCLIPTextModel, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_DOCSTRING)
......@@ -991,23 +993,25 @@ class FlaxCLIPVisionModel(FlaxCLIPVisionPreTrainedModel):
FLAX_CLIP_VISION_MODEL_DOCSTRING = """
Returns:
Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import CLIPProcessor, FlaxCLIPVisionModel

>>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="np")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooler_output = outputs.pooler_output # pooled CLS states
```
"""
overwrite_call_docstring(FlaxCLIPVisionModel, CLIP_VISION_INPUTS_DOCSTRING + FLAX_CLIP_VISION_MODEL_DOCSTRING)
......@@ -1115,24 +1119,26 @@ class FlaxCLIPModel(FlaxCLIPPreTrainedModel):
FLAX_CLIP_MODEL_DOCSTRING = """
Returns:
Example:

```python
>>> import jax
>>> from PIL import Image
>>> import requests
>>> from transformers import CLIPProcessor, FlaxCLIPModel

>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
```
"""
overwrite_call_docstring(FlaxCLIPModel, CLIP_INPUTS_DOCSTRING + FLAX_CLIP_MODEL_DOCSTRING)
......
......@@ -24,14 +24,14 @@ class CLIPProcessor:
r"""
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
[`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`]
and [`CLIPTokenizer`]. See the [`~CLIPProcessor.__call__`] and
[`~CLIPProcessor.decode`] for more information.
Args:
feature_extractor ([`CLIPFeatureExtractor`]):
The feature extractor is a required input.
tokenizer ([`CLIPTokenizer`]):
The tokenizer is a required input.
"""
......@@ -49,17 +49,19 @@ class CLIPProcessor:
def save_pretrained(self, save_directory):
"""
Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
......@@ -70,31 +72,33 @@ class CLIPProcessor:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
<Tip>
This class method is simply calling CLIPFeatureExtractor's
[`~PreTrainedFeatureExtractor.from_pretrained`] and CLIPTokenizer's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
`./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
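Both resolution paths described in the arguments above look like this in practice; the local directory mirrors the `./my_model_directory/` example from the docstring:

```python
>>> from transformers import CLIPProcessor

>>> # from a model id hosted on huggingface.co
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> # or from a directory previously populated by save_pretrained
>>> processor = CLIPProcessor.from_pretrained("./my_model_directory/")
```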
......@@ -104,38 +108,38 @@ class CLIPProcessor:
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the
`text` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if
`text` is not `None` to encode the text. To prepare the image(s), this method forwards the
`images` and `kwargs` arguments to CLIPFeatureExtractor's
[`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
docstring of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is the
number of channels, and H and W are the image height and width.
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if
`text` is not `None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
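A minimal sketch of a combined text-and-image call; the image-loading boilerplate mirrors the Flax examples above:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import CLIPProcessor

>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # returns a BatchEncoding with input_ids, attention_mask and pixel_values
>>> batch = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)
```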
......@@ -158,14 +162,14 @@ class CLIPProcessor:
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizer's
[`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.decode`].
Please refer to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
......@@ -105,33 +105,34 @@ class CLIPTokenizer(PreTrainedTokenizer):
be encoded differently whether it is at the beginning of the sentence (without space) or not:
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
one).
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The CLIP tokenizer detects the beginning of words by the preceding space.)
"""
......@@ -200,7 +201,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
@property
def pad_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
set.
"""
return 0
......@@ -219,18 +220,18 @@ class CLIPTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A CLIP sequence has the following format:
- single sequence: `<|startoftext|> X <|endoftext|>`
Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
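For illustration, a hedged sketch of the single-sequence format above:

```python
>>> from transformers import CLIPTokenizer

>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("a photo of a cat"))
>>> # prepends <|startoftext|> and appends <|endoftext|>
>>> with_special_tokens = tokenizer.build_inputs_with_special_tokens(ids)
```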
......@@ -241,18 +242,18 @@ class CLIPTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......
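A brief sketch of how the returned mask lines up with an encoded sequence:

```python
>>> from transformers import CLIPTokenizer

>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> ids = tokenizer("a photo of a cat")["input_ids"]
>>> # 1 marks <|startoftext|> and <|endoftext|>, 0 marks sequence tokens
>>> mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
```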
......@@ -49,51 +49,52 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class CLIPTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" CLIP tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```
>>> from transformers import CLIPTokenizerFast
>>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
`add_prefix_space=True`.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The CLIP tokenizer detects the beginning of words by the preceding space.)
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
......@@ -139,7 +140,7 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
@property
def pad_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
set.
"""
return 0
......
......@@ -30,61 +30,62 @@ CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ConvBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to
instantiate a ConvBERT model according to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a similar configuration to that of the ConvBERT [conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture. Configuration objects inherit from
[`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
[`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the ConvBERT model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`ConvBertModel`] or
[`TFConvBertModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`ConvBertModel`]
or [`TFConvBertModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
head_ratio (`int`, *optional*, defaults to 2):
Ratio gamma to reduce the number of attention heads.
num_groups (`int`, *optional*, defaults to 1):
The number of groups for grouped linear layers for the ConvBERT model.
conv_kernel_size (`int`, *optional*, defaults to 9):
The size of the convolutional kernel.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
Example:
```python
>>> from transformers import ConvBertModel, ConvBertConfig
>>> # Initializing a ConvBERT convbert-base-uncased style configuration
>>> configuration = ConvBertConfig()
>>> # Initializing a model from the convbert-base-uncased style configuration
>>> model = ConvBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "convbert"
def __init__(
......
......@@ -45,9 +45,9 @@ PRETRAINED_INIT_CONFIGURATION = {
class ConvBertTokenizer(BertTokenizer):
r"""
Construct a ConvBERT tokenizer. [`ConvBertTokenizer`] is identical to
[`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer
to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -46,12 +46,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class ConvBertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's *tokenizers* library).
[`ConvBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -33,59 +33,64 @@ class CpmTokenizer(XLNetTokenizer):
def __init__(self, *args, **kwargs):
"""
Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether to keep accents when tokenizing.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
this token instead.
sep_token (`str`, *optional*, defaults to `"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering. It is also used as the
last token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
super().__init__(*args, **kwargs)
try:
......
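A hedged usage sketch, assuming the `TsinghuaAI/CPM-Generate` checkpoint and that the `jieba` dependency mentioned above is installed:

```python
>>> from transformers import CpmTokenizer

>>> tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")  # assumed checkpoint
>>> ids = tokenizer("你好，世界")["input_ids"]  # "Hello, world" in Chinese
```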
......@@ -36,59 +36,64 @@ class CpmTokenizerFast(XLNetTokenizerFast):
def __init__(self, *args, **kwargs):
"""
Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether to keep accents when tokenizing.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
this token instead.
sep_token (`str`, *optional*, defaults to `"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering. It is also used as the
last token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
super().__init__(*args, **kwargs)
try:
......
......@@ -25,57 +25,58 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://huggingface.co/ctrl/resol
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`CTRLModel`] or a
[`TFCTRLModel`]. It is used to instantiate a CTRL model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
to that of the [ctrl](https://huggingface.co/ctrl) architecture from SalesForce.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 246534):
Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`CTRLModel`] or
[`TFCTRLModel`].
n_positions (`int`, *optional*, defaults to 256):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
n_embd (`int`, *optional*, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (`int`, *optional*, defaults to 8192):
Dimensionality of the inner dimension of the feed forward networks (FFN).
n_layer (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (`int`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
Examples:
```python
>>> from transformers import CTRLModel, CTRLConfig
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "ctrl"
keys_to_ignore_at_inference = ["past_key_values"]
......
......@@ -120,15 +120,15 @@ class CTRLTokenizer(PreTrainedTokenizer):
"""
Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
......
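A short sketch using the `ctrl` model id referenced in the CTRL config above:

```python
>>> from transformers import CTRLTokenizer

>>> tokenizer = CTRLTokenizer.from_pretrained("ctrl")
>>> ids = tokenizer("Hello world")["input_ids"]
```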
......@@ -32,59 +32,59 @@ DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DebertaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DebertaModel`] or a
[`TFDebertaModel`]. It is used to instantiate a DeBERTa model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
configuration to that of the DeBERTa [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Arguments:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the DeBERTa model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DebertaModel`] or
[`TFDebertaModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
:obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`,
`"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or
[`TFDebertaModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
relative_attention (`bool`, *optional*, defaults to `False`):
Whether to use relative position encoding.
max_relative_positions (`int`, *optional*, defaults to 1):
The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
value as `max_position_embeddings`.
pad_token_id (`int`, *optional*, defaults to 0):
The value used to pad input_ids.
position_biased_input (`bool`, *optional*, defaults to `True`):
Whether to add absolute position embedding to content embedding.
pos_att_type (`List[str]`, *optional*):
The type of relative position attention, it can be a combination of `["p2c", "c2p", "p2p"]`, e.g.
`["p2c"]`, `["p2c", "c2p"]`, `["p2c", "c2p", "p2p"]`.
"""
model_type = "deberta"
......
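A minimal configuration sketch in the same pattern as the ConvBERT and CTRL examples above; DeBERTa defaults are used throughout:

```python
>>> from transformers import DebertaModel, DebertaConfig

>>> # Initializing a DeBERTa microsoft/deberta-base style configuration
>>> configuration = DebertaConfig()

>>> # Initializing a model from that configuration
>>> model = DebertaModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```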
......@@ -64,23 +64,23 @@ class DebertaTokenizer(GPT2Tokenizer):
Constructs a DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
......@@ -141,13 +141,13 @@ class DebertaTokenizer(GPT2Tokenizer):
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -160,18 +160,18 @@ class DebertaTokenizer(GPT2Tokenizer):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -189,21 +189,21 @@ class DebertaTokenizer(GPT2Tokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......
......@@ -63,26 +63,26 @@ PRETRAINED_INIT_CONFIGURATION = {
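A hedged sketch of the sequence-pair mask described above, assuming the `microsoft/deberta-base` checkpoint from the config docstring:

```python
>>> from transformers import DebertaTokenizer

>>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

>>> first = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("first sequence"))
>>> second = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("second sequence"))
>>> # 0s cover [CLS] A [SEP], 1s cover B [SEP]
>>> token_type_ids = tokenizer.create_token_type_ids_from_sequences(first, second)
```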
class DebertaTokenizerFast(GPT2TokenizerFast):
"""
Constructs a "fast" DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece. It is
backed by HuggingFace's *tokenizers* library.
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
......@@ -129,11 +129,11 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
@property
def mask_token(self) -> str:
"""
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
comprise the space before the `[MASK]`.
comprise the space before the *[MASK]*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
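The space-greedy mask token is what makes this tokenizer usable in the fill-mask pipeline; a sketch, assuming a checkpoint with a masked-language-modeling head:

```python
>>> from transformers import pipeline

>>> unmasker = pipeline("fill-mask", model="microsoft/deberta-base")
>>> unmasker("Paris is the [MASK] of France.")  # the mask token absorbs the preceding space
```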
......@@ -161,13 +161,13 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
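For a sequence pair, the method therefore produces `[CLS] A [SEP] B [SEP]`; a sketch:

```python
>>> from transformers import DebertaTokenizerFast

>>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
>>> ids_a = tokenizer.encode("Sequence A", add_special_tokens=False)
>>> ids_b = tokenizer.encode("Sequence B", add_special_tokens=False)
>>> tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)  # [CLS] A [SEP] B [SEP]
```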
......@@ -182,21 +182,21 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
::
```
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
| first sequence | second sequence |
```
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......
......@@ -30,57 +30,57 @@ DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DebertaV2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DebertaV2Model`. It is used
This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used
to instantiate a DeBERTa-v2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
`microsoft/deberta-v2-xlarge <https://huggingface.co/microsoft/deberta-base>`__ architecture.
[microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Arguments:
vocab_size (:obj:`int`, `optional`, defaults to 128100):
vocab_size (`int`, *optional*, defaults to 128100):
Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaV2Model`.
hidden_size (:obj:`int`, `optional`, defaults to 1536):
the `input_ids` passed when calling [`DebertaV2Model`].
hidden_size (`int`, *optional*, defaults to 1536):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 24):
num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 6144):
intermediate_size (`int`, *optional*, defaults to 6144):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
:obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`,
`"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 0):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
:class:`~transformers.TFDebertaModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
type_vocab_size (`int`, *optional*, defaults to 0):
The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or
[`TFDebertaModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
layer_norm_eps (`float`, *optional*, defaults to 1e-7):
The epsilon used by the layer normalization layers.
relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
relative_attention (`bool`, *optional*, defaults to `True`):
Whether to use relative position encoding.
max_relative_positions (:obj:`int`, `optional`, defaults to -1):
The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
value as :obj:`max_position_embeddings`.
pad_token_id (:obj:`int`, `optional`, defaults to 0):
max_relative_positions (`int`, *optional*, defaults to -1):
The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
value as `max_position_embeddings`.
pad_token_id (`int`, *optional*, defaults to 0):
The value used to pad input_ids.
position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
position_biased_input (`bool`, *optional*, defaults to `False`):
Whether to add absolute position embeddings to the content embeddings.
pos_att_type (:obj:`List[str]`, `optional`):
The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
:obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
pos_att_type (`List[str]`, *optional*):
The type of relative position attention; it can be a combination of `["p2c", "c2p", "p2p"]`, e.g.
`["p2c"]`, `["p2c", "c2p"]`, `["p2c", "c2p", "p2p"]`.
"""
model_type = "deberta-v2"
......
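A usage sketch in the same style as the other configuration examples:

```python
>>> from transformers import DebertaV2Model, DebertaV2Config

>>> # Initializing a DeBERTa-v2 microsoft/deberta-v2-xlarge style configuration
>>> configuration = DebertaV2Config()

>>> # Initializing a model from the microsoft/deberta-v2-xlarge style configuration
>>> model = DebertaV2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```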
......@@ -52,49 +52,48 @@ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
class DebertaV2Tokenizer(PreTrainedTokenizer):
r"""
Constructs a DeBERTa-v2 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
bos_token (`string`, *optional*, defaults to "[CLS]"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
sequence. The token used is the `cls_token`.
eos_token (`string`, *optional*, defaults to "[SEP]"):
The end of sequence token. When building a sequence using special tokens, this is not the token that is
used for the end of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
used for the end of sequence. The token used is the `sep_token`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
......@@ -183,13 +182,13 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
- pair of sequences: [CLS] A [SEP] B [SEP]
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
......@@ -201,18 +200,18 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
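A sketch of the mask this returns on an already-encoded sequence — 1s at the special-token positions, 0s elsewhere:

```python
>>> from transformers import DebertaV2Tokenizer

>>> tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
>>> ids = tokenizer.encode("Hello world")  # encode() adds [CLS] and [SEP]
>>> tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)  # [1, 0, ..., 0, 1]
```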
......@@ -229,21 +228,21 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:
::
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......@@ -264,25 +263,24 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
class SPMTokenizer:
r"""
Constructs a tokenizer based on `SentencePiece <https://github.com/google/sentencepiece>`__.
Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
......
......@@ -28,59 +28,60 @@ DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DeiTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DeiTModel`. It is used to
This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to
instantiate a DeiT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DeiT
`facebook/deit-base-distilled-patch16-224 <https://huggingface.co/facebook/deit-base-distilled-patch16-224>`__
[facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (:obj:`int`, `optional`, defaults to 768):
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
image_size (:obj:`int`, `optional`, defaults to :obj:`224`):
image_size (`int`, *optional*, defaults to `224`):
The size (resolution) of each image.
patch_size (:obj:`int`, `optional`, defaults to :obj:`16`):
patch_size (`int`, *optional*, defaults to `16`):
The size (resolution) of each patch.
num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
num_channels (`int`, *optional*, defaults to `3`):
The number of input channels.
qkv_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
Example::
Example:
>>> from transformers import DeiTModel, DeiTConfig
```python
>>> from transformers import DeiTModel, DeiTConfig
>>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
>>> configuration = DeiTConfig()
>>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
>>> configuration = DeiTConfig()
>>> # Initializing a model from the deit-base-distilled-patch16-224 style configuration
>>> model = DeiTModel(configuration)
>>> # Initializing a model from the deit-base-distilled-patch16-224 style configuration
>>> model = DeiTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deit"
def __init__(
......
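Note that `image_size` and `patch_size` jointly determine the sequence length seen by the encoder: with the defaults, (224 / 16)² = 196 patch embeddings, to which DeiT prepends the class and distillation tokens. A quick check:

```python
>>> from transformers import DeiTConfig

>>> config = DeiTConfig()
>>> (config.image_size // config.patch_size) ** 2  # patch embeddings per image
196
```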
......@@ -38,31 +38,31 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a DeiT feature extractor.
This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to resize the input to a certain :obj:`size`.
size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 256):
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 256):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
is set to :obj:`True`.
resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
:obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
Only has an effect if :obj:`do_resize` is set to :obj:`True`.
do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
`PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
Only has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
the image is padded with 0's and then center cropped.
crop_size (:obj:`int`, `optional`, defaults to 224):
Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
:obj:`True`.
do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
crop_size (`int`, *optional*, defaults to 224):
Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
`True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with `image_mean` and `image_std`.
image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
......@@ -96,27 +96,29 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Main method to prepare for the model one or several image(s).
.. warning::
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
* :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
:class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
......
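Putting the `__call__` contract together — resize to 256, center-crop to 224, normalize — a sketch of a single-image call:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import DeiTFeatureExtractor

>>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> inputs["pixel_values"].shape  # (batch_size, num_channels, height, width)
torch.Size([1, 3, 224, 224])
```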
......@@ -487,22 +487,23 @@ class DeiTModel(DeiTPreTrainedModel):
r"""
Returns:
Examples::
Examples:
>>> from transformers import DeiTFeatureExtractor, DeiTModel
>>> from PIL import Image
>>> import requests
```python
>>> from transformers import DeiTFeatureExtractor, DeiTModel
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
>>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
>>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
>>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......@@ -729,25 +730,26 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
"""
Returns:
Examples::
Examples:
>>> from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
>>> from PIL import Image
>>> import requests
```python
>>> from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
>>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
>>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
>>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> # model predicts one of the 1000 ImageNet classes
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
"""
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> # model predicts one of the 1000 ImageNet classes
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deit(
......
......@@ -28,93 +28,92 @@ DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DetrConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DetrModel`. It is used to
This is the configuration class to store the configuration of a [`DetrModel`]. It is used to
instantiate a DETR model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DETR `facebook/detr-resnet-50
<https://huggingface.co/facebook/detr-resnet-50>`__ architecture.
configuration with the defaults will yield a similar configuration to that of the DETR [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
num_queries (:obj:`int`, `optional`, defaults to 100):
num_queries (`int`, *optional*, defaults to 100):
Number of object queries, i.e. detection slots. This is the maximal number of objects
:class:`~transformers.DetrModel` can detect in a single image. For COCO, we recommend 100 queries.
d_model (:obj:`int`, `optional`, defaults to 256):
[`DetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
d_model (`int`, *optional*, defaults to 256):
Dimension of the layers.
encoder_layers (:obj:`int`, `optional`, defaults to 6):
encoder_layers (`int`, *optional*, defaults to 6):
Number of encoder layers.
decoder_layers (:obj:`int`, `optional`, defaults to 6):
decoder_layers (`int`, *optional*, defaults to 6):
Number of decoder layers.
encoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
encoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
decoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
dropout (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
init_std (:obj:`float`, `optional`, defaults to 0.02):
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (:obj:`float`, `optional`, defaults to 1):
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
The LayerDrop probability for the encoder. See the `LayerDrop paper <see
https://arxiv.org/abs/1909.11556>`__ for more details.
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
The LayerDrop probability for the decoder. See the `LayerDrop paper <see
https://arxiv.org/abs/1909.11556>`__ for more details.
auxiliary_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"sine"`):
Type of position embeddings to be used on top of the image features. One of :obj:`"sine"` or
:obj:`"learned"`.
backbone (:obj:`str`, `optional`, defaults to :obj:`"resnet50"`):
position_embedding_type (`str`, *optional*, defaults to `"sine"`):
Type of position embeddings to be used on top of the image features. One of `"sine"` or
`"learned"`.
backbone (`str`, *optional*, defaults to `"resnet50"`):
Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a
list of all available models, see `this page
<https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model>`__.
dilation (:obj:`bool`, `optional`, defaults to :obj:`False`):
list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
dilation (`bool`, *optional*, defaults to `False`):
Whether to replace stride with dilation in the last convolutional block (DC5).
class_cost (:obj:`float`, `optional`, defaults to 1):
class_cost (`float`, *optional*, defaults to 1):
Relative weight of the classification error in the Hungarian matching cost.
bbox_cost (:obj:`float`, `optional`, defaults to 5):
bbox_cost (`float`, *optional*, defaults to 5):
Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
giou_cost (:obj:`float`, `optional`, defaults to 2):
giou_cost (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
mask_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
mask_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the Focal loss in the panoptic segmentation loss.
dice_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
dice_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
bbox_loss_coefficient (:obj:`float`, `optional`, defaults to 5):
bbox_loss_coefficient (`float`, *optional*, defaults to 5):
Relative weight of the L1 bounding box loss in the object detection loss.
giou_loss_coefficient (:obj:`float`, `optional`, defaults to 2):
giou_loss_coefficient (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss in the object detection loss.
eos_coefficient (:obj:`float`, `optional`, defaults to 0.1):
eos_coefficient (`float`, *optional*, defaults to 0.1):
Relative classification weight of the 'no-object' class in the object detection loss.
Examples::
Examples:
>>> from transformers import DetrModel, DetrConfig
```python
>>> from transformers import DetrModel, DetrConfig
>>> # Initializing a DETR facebook/detr-resnet-50 style configuration
>>> configuration = DetrConfig()
>>> # Initializing a DETR facebook/detr-resnet-50 style configuration
>>> configuration = DetrConfig()
>>> # Initializing a model from the facebook/detr-resnet-50 style configuration
>>> model = DetrModel(configuration)
>>> # Initializing a model from the facebook/detr-resnet-50 style configuration
>>> model = DetrModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "detr"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
......
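Beyond the defaults, the backbone-related arguments documented above can be combined for the dilated DC5 variants; a sketch (the backbone itself is only downloaded when a model is instantiated, which requires the timm package):

```python
>>> from transformers import DetrConfig

>>> # a DC5-style configuration: ResNet-101 backbone with dilation in the last convolutional block
>>> config = DetrConfig(backbone="resnet101", dilation=True)
```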