Unverified Commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
......@@ -540,21 +540,21 @@ class TFRagModel(TFRagPreTrainedModel):
r"""
Returns:
Example:

```python
>>> from transformers import RagTokenizer, RagRetriever, RagModel
>>> import torch

>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)

>>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids)
```"""
assert (
"decoder_cached_states" not in kwargs
), "Please use past_key_values to cache intermediate outputs" # from modeling_tf_bart.py
......
......@@ -44,7 +44,7 @@ LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wik
class Index:
"""
A base class for the Indices encapsulated by the [`RagRetriever`].
"""
def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
......@@ -52,31 +52,31 @@ class Index:
Returns a list of dictionaries, containing titles and text of the retrieved documents.
Args:
doc_ids (`np.ndarray` of shape `(batch_size, n_docs)`):
A tensor of document indices.
"""
raise NotImplementedError
def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
"""
For each query in the batch, retrieves `n_docs` documents.
Args:
question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
An array of query vectors.
n_docs (`int`):
The number of docs retrieved per query.
Returns:
`np.ndarray` of shape `(batch_size, n_docs)`: A tensor of indices of retrieved documents.
`np.ndarray` of shape `(batch_size, vector_size)`: A tensor of vector representations of
retrieved documents.
"""
raise NotImplementedError
def is_initialized(self):
"""
Returns `True` if index is already initialized.
"""
raise NotImplementedError
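To make the `Index` contract above concrete, here is a minimal, purely illustrative sketch of a subclass that brute-forces inner-product search with NumPy. It only implements the three methods shown in this hunk; the class name and the in-memory storage are assumptions, not library code.

```python
from typing import List, Tuple

import numpy as np


class InMemoryIndex(Index):
    """Illustrative Index subclass over a small, in-memory passage set (hypothetical)."""

    def __init__(self, passages: List[dict], embeddings: np.ndarray):
        self.passages = passages      # each dict holds "title" and "text"
        self.embeddings = embeddings  # shape (num_passages, vector_size)

    def is_initialized(self):
        return True  # everything lives in memory, nothing to load lazily

    def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
        # One dict of lists per query, mirroring the documented contract
        return [
            {
                "title": [self.passages[i]["title"] for i in ids],
                "text": [self.passages[i]["text"] for i in ids],
            }
            for ids in doc_ids
        ]

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        scores = question_hidden_states @ self.embeddings.T   # (batch_size, num_passages)
        doc_ids = np.argsort(-scores, axis=1)[:, :n_docs]     # (batch_size, n_docs)
        vectors = self.embeddings[doc_ids]                    # (batch_size, n_docs, vector_size)
        return doc_ids, vectors
```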
......@@ -95,11 +95,11 @@ class LegacyIndex(Index):
default faiss index parameters as specified in that repository.
Args:
vector_size (`int`):
The dimension of indexed vectors.
index_path (`str`):
A path to a *directory* containing index files compatible with
[`~models.rag.retrieval_rag.LegacyIndex`]
"""
INDEX_FILENAME = "hf_bert_base.hnswSQ8_correct_phi_128.c_index"
......@@ -114,7 +114,7 @@ class LegacyIndex(Index):
self._index_initialized = False
def _resolve_path(self, index_path, filename):
assert os.path.isdir(index_path) or is_remote_url(index_path), "Please specify a valid `index_path`."
archive_file = os.path.join(index_path, filename)
try:
# Load from URL or cache if already cached
......@@ -228,23 +228,23 @@ class HFIndexBase(Index):
class CanonicalHFIndex(HFIndexBase):
"""
A wrapper around an instance of [`~datasets.Datasets`]. If `index_path` is set to `None`, we load the
pre-computed index available with the [`~datasets.arrow_dataset.Dataset`], otherwise, we load the index from
the indicated path on disk.
Args:
vector_size (`int`): the dimension of the passages embeddings used by the index
dataset_name (`str`, optional, defaults to `wiki_dpr`):
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with `datasets.list_datasets()`).
dataset_split (`str`, optional, defaults to `train`):
Which split of the `dataset` to load.
index_name (`str`, optional, defaults to `train`):
The index_name of the index associated with the `dataset`. The index loaded from `index_path` will be
saved under this name.
index_path (`str`, optional, defaults to `None`):
The path to the serialized faiss index on disk.
use_dummy_dataset (`bool`, optional, defaults to `False`): If True, use the dummy configuration of the dataset for tests.
"""
def __init__(
......@@ -289,15 +289,15 @@ class CanonicalHFIndex(HFIndexBase):
class CustomHFIndex(HFIndexBase):
"""
A wrapper around an instance of [`~datasets.Datasets`]. The dataset and the index are both loaded from the
indicated paths on disk.
Args:
vector_size (`int`): the dimension of the passages embeddings used by the index
dataset_path (`str`):
The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
embeddings (arrays of dimension vector_size)
index_path (`str`):
The path to the serialized faiss index on disk.
"""
......@@ -310,8 +310,8 @@ class CustomHFIndex(HFIndexBase):
logger.info(f"Loading passages from {dataset_path}")
if dataset_path is None or index_path is None:
raise ValueError(
"Please provide ``dataset_path`` and ``index_path`` after calling ``dataset.save_to_disk(dataset_path)`` "
"and ``dataset.get_index('embeddings').save(index_path)``."
"Please provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` "
"and `dataset.get_index('embeddings').save(index_path)`."
)
dataset = load_from_disk(dataset_path)
return cls(vector_size=vector_size, dataset=dataset, index_path=index_path)
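The error message above spells out how the two paths are produced. As a hedged sketch (the passage texts, output paths, and the 768-dim stand-in vectors are assumptions; a real setup would compute `embeddings` with a DPR context encoder), a compatible dataset and faiss index can be built with the `datasets` library like this:

```python
from datasets import Dataset

# Toy passages; in practice "embeddings" comes from a DPR context encoder.
data = {
    "title": ["Paris", "Berlin"],
    "text": ["Paris is the capital of France.", "Berlin is the capital of Germany."],
    "embeddings": [[0.1] * 768, [0.2] * 768],
}
dataset = Dataset.from_dict(data)

# Build the faiss index over the "embeddings" column, save it separately,
# then drop it before serializing the dataset itself.
dataset.add_faiss_index(column="embeddings")
dataset.get_index("embeddings").save("my_index.faiss")  # -> index_path
dataset.drop_index("embeddings")
dataset.save_to_disk("my_dataset")                      # -> dataset_path
```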
......@@ -329,40 +329,40 @@ class RagRetriever:
contents, and it formats them to be used with a RagModel.
Args:
config ([`RagConfig`]):
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
`Index` to build. You can load your own custom dataset with `config.index_name="custom"` or use a
canonical one (default) from the datasets library with `config.index_name="wiki_dpr"` for example.
question_encoder_tokenizer ([`PreTrainedTokenizer`]):
The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
generator_tokenizer.
generator_tokenizer ([`PreTrainedTokenizer`]):
The tokenizer used for the generator part of the RagModel.
index ([`~models.rag.retrieval_rag.Index`], optional, defaults to the one defined by the configuration):
If specified, use this index instead of the one built using the configuration
Examples:

```python
>>> # To load the default "wiki_dpr" dataset with 21M passages from wikipedia (index name is 'compressed' or 'exact')
>>> from transformers import RagRetriever
>>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', dataset="wiki_dpr", index_name='compressed')

>>> # To load your own indexed dataset built with the datasets library. More info on how to build the indexed dataset in examples/rag/use_own_knowledge_dataset.py
>>> from transformers import RagRetriever
>>> dataset = ...  # dataset must be a datasets.Datasets object with columns "title", "text" and "embeddings", and it must have a faiss index
>>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', indexed_dataset=dataset)

>>> # To load your own indexed dataset built with the datasets library that was saved on disk. More info in examples/rag/use_own_knowledge_dataset.py
>>> from transformers import RagRetriever
>>> dataset_path = "path/to/my/dataset"  # dataset saved via *dataset.save_to_disk(...)*
>>> index_path = "path/to/my/index.faiss"  # faiss index saved via *dataset.get_index("embeddings").save(...)*
>>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='custom', passages_path=dataset_path, index_path=index_path)

>>> # To load the legacy index built originally for Rag's paper
>>> from transformers import RagRetriever
>>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='legacy')
```"""
def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True):
self._init_retrieval = init_retrieval
......@@ -454,19 +454,19 @@ class RagRetriever:
def postprocess_docs(self, docs, input_strings, prefix, n_docs, return_tensors=None):
r"""
Postprocessing retrieved `docs` and combining them with `input_strings`.
Args:
docs (`dict`):
Retrieved documents.
input_strings (`str`):
Input strings decoded by `preprocess_query`.
prefix (`str`):
Prefix added at the beginning of each input, typically used with T5-based models.
Return:
`tuple(tensors)`: a tuple consisting of two elements: contextualized `input_ids` and a compatible
`attention_mask`.
"""
def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
......@@ -526,22 +526,22 @@ class RagRetriever:
def retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, List[dict]]:
"""
Retrieves documents for specified `question_hidden_states`.
Args:
question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
A batch of query vectors to retrieve with.
n_docs (`int`):
The number of docs retrieved per query.
Return:
`Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:
- **retrieved_doc_embeds** (`np.ndarray` of shape `(batch_size, n_docs, dim)`) -- The retrieval
embeddings of the retrieved docs per query.
- **doc_ids** (`np.ndarray` of shape `(batch_size, n_docs)`) -- The ids of the documents in the
index
- **doc_dicts** (`List[dict]`): The `retrieved_doc_embeds` examples per query.
"""
doc_ids, retrieved_doc_embeds = self._main_retrieve(question_hidden_states, n_docs)
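A hedged usage sketch of the `retrieve` contract documented above; the dummy index and the 768-dimensional random query vectors are assumptions made only to show the shapes:

```python
import numpy as np
from transformers import RagRetriever

retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
)

# Two fake query vectors standing in for DPR question-encoder outputs.
question_hidden_states = np.random.randn(2, 768).astype(np.float32)

retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(question_hidden_states, n_docs=5)
print(retrieved_doc_embeds.shape)  # (2, 5, 768)
print(doc_ids.shape)               # (2, 5)
print(len(doc_dicts))              # one dict of retrieved titles/texts per query
```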
......@@ -561,34 +561,34 @@ class RagRetriever:
return_tensors=None,
) -> BatchEncoding:
"""
Retrieves documents for specified `question_hidden_states`.
Args:
question_input_ids: (`List[List[int]]`) batch of input ids
question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
A batch of query vectors to retrieve with.
prefix: (`str`, *optional*):
The prefix used by the generator's tokenizer.
n_docs (`int`, *optional*):
The number of docs retrieved per query.
return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to "pt"):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following
fields:
- **context_input_ids** -- List of token ids to be fed to a model.
[What are input IDs?](../glossary#input-ids)
- **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
(when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
[What are attention masks?](../glossary#attention-mask)
- **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
- **doc_ids** -- List of ids of the retrieved documents
......
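A hedged end-to-end sketch of the call documented above: the question is encoded with a DPR question encoder and handed to the retriever, which returns the `BatchEncoding` fields listed. The checkpoint names and the `n_docs` value are assumptions.

```python
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, RagRetriever

tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)

inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
question_hidden_states = encoder(**inputs).pooler_output  # (batch_size, vector_size)

out = retriever(
    question_input_ids=inputs["input_ids"].numpy(),
    question_hidden_states=question_hidden_states.detach().numpy(),
    prefix=None,
    n_docs=5,
    return_tensors="pt",
)
print(out["context_input_ids"].shape)       # ids of question + retrieved passages for the generator
print(out["context_attention_mask"].shape)
print(out["retrieved_doc_embeds"].shape)
```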
......@@ -29,133 +29,131 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ReformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ReformerModel`]. It is used to
instantiate a Reformer model according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
attention_head_size (`int`, *optional*, defaults to 64):
Dimensionality of the projected key, query and value vectors
attn_layers (`List[str]`, *optional*, defaults to `["local", "lsh", "local", "lsh", "local", "lsh"]`):
List of attention layer types in ascending order. It can be chosen between a LSHSelfAttention layer
(`"lsh"`) and a LocalSelfAttention layer (`"local"`).
For more information on LSHSelfAttention layer, see [LSH Self Attention](reformer#lsh-self-attention). For more information on LocalSelfAttention layer, see [Local Self
Attention](reformer#local-self-attention).
axial_pos_embds (`bool`, *optional*, defaults to `True`):
Whether or not to use axial position embeddings. For more information on how axial position embeddings
work, see [Axial Position Encodings](reformer#axial-positional-encodings).
axial_norm_std (`float`, *optional*, defaults to 1.0):
The standard deviation of the normal_initializer for initializing the weight matrices of the axial
positional encodings.
axial_pos_shape (`List[int]`, *optional*, defaults to `[64, 64]`):
The position dims of the axial position encodings. During training, the product of the position dims has to
be equal to the sequence length.
For more information on how axial position embeddings work, see [Axial Position Encodings](reformer#axial-positional-encodings).
axial_pos_embds_dim (`List[int]`, *optional*, defaults to `[64, 192]`):
The embedding dims of the axial position encodings. The sum of the embedding dims has to be equal to the
hidden size.
For more information on how axial position embeddings work, see [Axial Position Encodings](reformer#axial-positional-encodings).
chunk_size_lm_head (`int`, *optional*, defaults to 0):
The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed
forward layer is not chunked. A chunk size of n means that the feed forward layer processes n <
sequence_length embeddings at a time.
For more information on feed forward chunking, see [How does Feed Forward Chunking work?](../glossary#feed-forward-chunking).
eos_token_id (`int`, *optional*, defaults to 2):
The token id for the end-of-sentence token.
feed_forward_size (`int`, *optional*, defaults to 512):
Dimensionality of the feed_forward layer in the residual attention block.
hash_seed (`int`, *optional*):
Seed that can be used to make local sensitive hashing in `LSHSelfAttention` deterministic. This should
only be set for testing purposes. For evaluation and training purposes `hash_seed` should be left as
`None` to ensure fully random rotations in local sensitive hashing scheme.
hidden_act (`str` or `Callable`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the feed forward layer in the residual attention
block. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.05):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the output hidden states of the residual attention blocks.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
is_decoder (`bool`, *optional*, defaults to `False`):
Whether or not to use a causal mask in addition to the `attention_mask` passed to
[`ReformerModel`]. When using the Reformer for causal language modeling, this argument
should be set to `True`.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
local_chunk_length (`int`, *optional*, defaults to 64):
Length of chunk which attends to itself in `LocalSelfAttention`. Chunking reduces memory complexity
from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length /
chunk length (chunked self attention).
local_num_chunks_before (`int`, *optional*, defaults to 1):
Number of previous neighbouring chunks to attend to in `LocalSelfAttention` layer to itself.
local_num_chunks_after (`int`, *optional*, defaults to 0):
Number of following neighbouring chunks to attend to in `LocalSelfAttention` layer in addition to
itself.
local_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities in `LocalSelfAttention`.
lsh_attn_chunk_length (`int`, *optional*, defaults to 64):
Length of chunk which attends to itself in `LSHSelfAttention`. Chunking reduces memory complexity from
sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk
length (chunked self attention).
lsh_num_chunks_before (`int`, *optional*, defaults to 1):
Number of previous neighbouring chunks to attend to in `LSHSelfAttention` layer to itself.
lsh_num_chunks_after (`int`, *optional*, defaults to 0):
Number of following neighbouring chunks to attend to in `LSHSelfAttention` layer to itself.
lsh_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities in `LSHSelfAttention`.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_buckets (`int` or `List[int]`, *optional*):
Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme.
Each query key vector is hashed into a hash in `1, ..., num_buckets`. The number of buckets can also
be factorized into a list for improved memory complexity. In this case, each query key vector is hashed
into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if
`num_buckets` is factorized into two factors. The number of buckets (or the product of the factors)
should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is not set, a good value
is calculated on the fly.
num_hashes (`int`, *optional*, defaults to 1):
Number of hashing rounds (e.g., number of random rotations) in Local Sensitive Hashing scheme. The higher
`num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time
intensive the hashing becomes.
pad_token_id (`int`, *optional*, defaults to 0):
The token id for the padding token.
vocab_size (`int`, *optional*, defaults to 320):
Vocabulary size of the Reformer model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`ReformerModel`].
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie input and output embeddings.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
Examples:

```python
>>> from transformers import ReformerModel, ReformerConfig

>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()

>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "reformer"
keys_to_ignore_at_inference = ["past_buckets_states"]
attribute_map = {}
......
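Two of the arguments above carry implicit constraints: the product of `axial_pos_shape` must match the (padded) training sequence length, and the entries of `axial_pos_embds_dim` must sum to `hidden_size`. A hedged sketch, with values picked only to satisfy those constraints:

```python
from transformers import ReformerConfig

config = ReformerConfig(
    hidden_size=256,
    axial_pos_shape=[64, 64],        # 64 * 64 == 4096 positions
    axial_pos_embds_dim=[64, 192],   # 64 + 192 == hidden_size
    max_position_embeddings=4096,
    attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
)

# Assuming training sequences are padded to max_position_embeddings:
assert config.axial_pos_shape[0] * config.axial_pos_shape[1] == config.max_position_embeddings
assert sum(config.axial_pos_embds_dim) == config.hidden_size
```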
......@@ -45,42 +45,44 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class ReformerTokenizer(PreTrainedTokenizer):
"""
Construct a Reformer tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.

</Tip>

unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
......
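A hedged sketch of the `sp_model_kwargs` option documented above, enabling SentencePiece subword regularization; the checkpoint name is assumed to be the canonical Reformer one and the sampling values are arbitrary:

```python
from transformers import ReformerTokenizer

tokenizer = ReformerTokenizer.from_pretrained(
    "google/reformer-crime-and-punishment",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, repeated calls may segment the same text differently.
print(tokenizer.tokenize("A sample sentence."))
```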
......@@ -53,29 +53,31 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class ReformerTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
Construct a "fast" Reformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.

</Tip>

unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
"""
......
......@@ -28,56 +28,60 @@ REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RemBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RemBertModel`]. It is used to
instantiate a RemBERT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the rembert-large architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 250300):
Vocabulary size of the RemBERT model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`RemBertModel`] or
[`TFRemBertModel`]. Vocabulary size of the model. Defines the different tokens that can
be represented by the *inputs_ids* passed to the forward method of [`RemBertModel`].
hidden_size (`int`, *optional*, defaults to 1152):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 18):
Number of attention heads for each attention layer in the Transformer encoder.
input_embedding_size (`int`, *optional*, defaults to 256):
Dimensionality of the input embeddings.
output_embedding_size (`int`, *optional*, defaults to 1664):
Dimensionality of the output embeddings.
intermediate_size (`int`, *optional*, defaults to 4608):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0):
The dropout ratio for the attention probabilities.
classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the classifier layer when fine-tuning.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`RemBertModel`] or
[`TFRemBertModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
Example:

```python
>>> from transformers import RemBertModel, RemBertConfig
>>> # Initializing a RemBERT rembert style configuration
......
......@@ -42,48 +42,54 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RemBertTokenizer(PreTrainedTokenizer):
"""
Construct a RemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
bos_token (`str`, *optional*, defaults to `"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.

</Tip>

eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.

</Tip>

unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -170,17 +176,17 @@ class RemBertTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A REMBERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
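A hedged sketch of the `[CLS] X [SEP]` / `[CLS] A [SEP] B [SEP]` layouts described above; the checkpoint name is an assumption:

```python
from transformers import RemBertTokenizer

tokenizer = RemBertTokenizer.from_pretrained("google/rembert")

ids_a = tokenizer.encode("first sequence", add_special_tokens=False)
ids_b = tokenizer.encode("second sequence", add_special_tokens=False)

single = tokenizer.build_inputs_with_special_tokens(ids_a)        # [CLS] A [SEP]
pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)   # [CLS] A [SEP] B [SEP]
print(tokenizer.convert_ids_to_tokens(single))
print(tokenizer.convert_ids_to_tokens(pair))
```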
......@@ -193,18 +199,18 @@ class RemBertTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -226,21 +232,21 @@ class RemBertTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```

If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......
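A hedged sketch of the 0/1 layout shown in the mask diagram above (checkpoint name assumed):

```python
from transformers import RemBertTokenizer

tokenizer = RemBertTokenizer.from_pretrained("google/rembert")

ids_a = tokenizer.encode("first sequence", add_special_tokens=False)
ids_b = tokenizer.encode("second sequence", add_special_tokens=False)

# 0s cover [CLS] + A + [SEP]; 1s cover B + [SEP]
print(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b))
```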
......@@ -51,44 +51,46 @@ SPIECE_UNDERLINE = "▁"
class RemBertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" RemBert tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
remove_space (`bool`, *optional*, defaults to `True`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (`bool`, *optional*, defaults to `False`):
Whether or not to keep accents when tokenizing.
bos_token (`str`, *optional*, defaults to `"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.

</Tip>

eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token.

<Tip>

When building a sequence using special tokens, this is not the token
that is used for the end of sequence. The token used is the `sep_token`.

</Tip>

unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
......@@ -145,17 +147,17 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A RemBERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (`List[int]`, *optional*, defaults to `None`):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -168,18 +170,18 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*, defaults to `None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -201,21 +203,21 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```

if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*, defaults to `None`):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......
......@@ -28,44 +28,44 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RetriBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used
to instantiate a RetriBertModel model according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the RetriBERT model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`RetriBertModel`]
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the *token_type_ids* passed into [`BertModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
share_encoders (`bool`, *optional*, defaults to `True`):
Whether or not to use the same Bert-type encoder for the queries and document
projection_dim (`int`, *optional*, defaults to 128):
Final dimension of the query and document representation after projection
"""
model_type = "retribert"
......
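Since `RetriBertConfig` ships without a usage example, here is a minimal sketch mirroring the other configuration examples in this conversion; it only instantiates the defaults described above:

```python
>>> from transformers import RetriBertConfig, RetriBertModel

>>> # Initializing a RetriBERT configuration with the defaults listed above
>>> configuration = RetriBertConfig()

>>> # Initializing a (randomly initialized) model from that configuration
>>> model = RetriBertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```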
......@@ -42,10 +42,10 @@ class RetriBertTokenizer(BertTokenizer):
r"""
Constructs a RetriBERT tokenizer.
:class:`~transformers.RetroBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
[`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
......
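A short, hedged usage sketch; the `yjernite/retribert-base-uncased` checkpoint name is an assumption made only for illustration:

```python
>>> from transformers import RetriBertTokenizer

>>> tokenizer = RetriBertTokenizer.from_pretrained("yjernite/retribert-base-uncased")
>>> encoded = tokenizer("How many people live in Paris?")
>>> # BERT-style encoding: input_ids, token_type_ids and attention_mask
```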
......@@ -44,12 +44,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class RetriBertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's *tokenizers* library).
:class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
[`RetriBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
......
......@@ -36,30 +36,31 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a
:class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified
This is the configuration class to store the configuration of a [`RobertaModel`] or a
[`TFRobertaModel`]. It is used to instantiate a RoBERTa model according to the specified
arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
The [`RobertaConfig`] class directly inherits [`BertConfig`]. It reuses the
same defaults. Please check the parent class for more information.
Examples::
Examples:
>>> from transformers import RobertaConfig, RobertaModel
```python
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "roberta"
def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
......
......@@ -64,64 +64,71 @@ class RobertaTokenizer(GPT2Tokenizer):
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
::
>>> from transformers import RobertaTokenizer
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
>>> tokenizer("Hello world")['input_ids']
[0, 31414, 232, 328, 2]
>>> tokenizer(" Hello world")['input_ids']
[0, 20920, 232, 2]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
```
>>> from transformers import RobertaTokenizer
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
>>> tokenizer("Hello world")['input_ids']
[0, 31414, 232, 328, 2]
>>> tokenizer(" Hello world")['input_ids']
[0, 20920, 232, 2]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
<Tip>
When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
one).
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
one).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
merges_file (`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
"""
......@@ -178,17 +185,17 @@ class RobertaTokenizer(GPT2Tokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
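A minimal sketch of the pair layout `<s> A </s></s> B </s>` documented above; the integer ids are placeholders chosen only for demonstration:

```python
>>> from transformers import RobertaTokenizer

>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
>>> cls, sep = tokenizer.cls_token_id, tokenizer.sep_token_id

>>> # pair of sequences: <s> A </s></s> B </s>
>>> pair = tokenizer.build_inputs_with_special_tokens([10, 20], [30, 40])
>>> pair == [cls, 10, 20, sep, sep, 30, 40, sep]
True
```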
......@@ -201,18 +208,18 @@ class RobertaTokenizer(GPT2Tokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -231,13 +238,13 @@ class RobertaTokenizer(GPT2Tokenizer):
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......
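A hedged sketch of the all-zeros mask returned above; the placeholder ids are assumptions for demonstration only:

```python
>>> from transformers import RobertaTokenizer

>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

>>> # RoBERTa does not use token type ids: one zero per position of <s> A </s></s> B </s>
>>> tokenizer.create_token_type_ids_from_sequences([10, 20], [30, 40])
[0, 0, 0, 0, 0, 0, 0, 0]
```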
......@@ -65,73 +65,80 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RobertaTokenizerFast(GPT2TokenizerFast):
"""
Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
::
```
>>> from transformers import RobertaTokenizerFast
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
>>> tokenizer("Hello world")['input_ids']
[0, 31414, 232, 328, 2]
>>> tokenizer(" Hello world")['input_ids']
[0, 20920, 232, 2]
```
>>> from transformers import RobertaTokenizerFast
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
>>> tokenizer("Hello world")['input_ids']
[0, 31414, 232, 328, 2]
>>> tokenizer(" Hello world")['input_ids']
[0, 20920, 232, 2]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
<Tip>
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
`add_prefix_space=True`.
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
</Tip>
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
merges_file (`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
.. note::
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the `sep_token`.
</Tip>
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether the post processing step should trim offsets to avoid including whitespaces.
"""
......@@ -176,11 +183,11 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
@property
def mask_token(self) -> str:
"""
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
comprise the space before the *<mask>*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
......@@ -214,13 +221,13 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......
......@@ -33,67 +33,68 @@ ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RoFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RoFormerModel`. It is used to
This is the configuration class to store the configuration of a [`RoFormerModel`]. It is used to
instantiate a RoFormer model according to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a similar configuration to that of the RoFormer
`junnyu/roformer_chinese_base <https://huggingface.co/junnyu/roformer_chinese_base>`__ architecture.
[junnyu/roformer_chinese_base](https://huggingface.co/junnyu/roformer_chinese_base) architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 50000):
vocab_size (`int`, *optional*, defaults to 50000):
Vocabulary size of the RoFormer model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.RoFormerModel` or
:class:`~transformers.TFRoFormerModel`.
embedding_size (:obj:`int`, `optional`, defaults to None):
Dimensionality of the encoder layers and the pooler layer. Defaults to the :obj:`hidden_size` if not
the `inputs_ids` passed when calling [`RoFormerModel`] or
[`TFRoFormerModel`].
embedding_size (`int`, *optional*, defaults to None):
Dimensionality of the encoder layers and the pooler layer. Defaults to the `hidden_size` if not
provided.
hidden_size (:obj:`int`, `optional`, defaults to 768):
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 1536):
max_position_embeddings (`int`, *optional*, defaults to 1536):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 1536).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RoFormerModel`
or :class:`~transformers.TFRoFormerModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`RoFormerModel`]
or [`TFRoFormerModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if ``config.is_decoder=True``.
rotary_value (:obj:`bool`, `optional`, defaults to :obj:`False`):
relevant if `config.is_decoder=True`.
rotary_value (`bool`, *optional*, defaults to `False`):
Whether or not to apply rotary position embeddings on the value layer.
Example::
Example:
>>> from transformers import RoFormerModel, RoFormerConfig
```python
>>> from transformers import RoFormerModel, RoFormerConfig
>>> # Initializing a RoFormer junnyu/roformer_chinese_base style configuration
>>> configuration = RoFormerConfig()
>>> # Initializing a RoFormer junnyu/roformer_chinese_base style configuration
>>> configuration = RoFormerConfig()
>>> # Initializing a model from the junnyu/roformer_chinese_base style configuration
>>> model = RoFormerModel(configuration)
>>> # Initializing a model from the junnyu/roformer_chinese_base style configuration
>>> model = RoFormerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "roformer"
def __init__(
......
......@@ -60,53 +60,52 @@ PRETRAINED_INIT_CONFIGURATION = {
class RoFormerTokenizer(PreTrainedTokenizer):
r"""
Construct a RoFormer tokenizer. Based on `Rust Jieba <https://pypi.org/project/rjieba/>`.
Construct a RoFormer tokenizer. Based on [Rust Jieba](https://pypi.org/project/rjieba/).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (:obj:`Iterable`, `optional`):
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
`do_basic_tokenize=True`
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents: (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
value for `lowercase` (as in the original BERT).
Example::
Example:
>>> from transformers import RoFormerTokenizer
>>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
>>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
"""
```python
>>> from transformers import RoFormerTokenizer
>>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
>>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
```"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
......@@ -230,17 +229,17 @@ class RoFormerTokenizer(PreTrainedTokenizer):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoFormer sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
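A hedged sketch of the single-sequence layout `[CLS] X [SEP]` returned above; the placeholder ids are chosen only for demonstration:

```python
>>> from transformers import RoFormerTokenizer

>>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
>>> cls, sep = tokenizer.cls_token_id, tokenizer.sep_token_id

>>> # single sequence: [CLS] X [SEP]
>>> tokenizer.build_inputs_with_special_tokens([10, 20, 30]) == [cls, 10, 20, 30, sep]
True
```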
......@@ -253,18 +252,18 @@ class RoFormerTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -283,21 +282,21 @@ class RoFormerTokenizer(PreTrainedTokenizer):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......
......@@ -62,23 +62,23 @@ PRETRAINED_INIT_CONFIGURATION = {
class RoFormerTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" RoFormer tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" RoFormer tokenizer (backed by HuggingFace's *tokenizers* library).
:class:`~transformers.RoFormerTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and
[`RoFormerTokenizerFast`] is almost identical to [`BertTokenizerFast`] and
runs end-to-end tokenization: punctuation splitting and wordpiece. There are some differences between them when
tokenizing Chinese.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Example::
Example:
>>> from transformers import RoFormerTokenizerFast
>>> tokenizer = RoFormerTokenizerFast.from_pretrained('junnyu/roformer_chinese_base')
>>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
"""
```python
>>> from transformers import RoFormerTokenizerFast
>>> tokenizer = RoFormerTokenizerFast.from_pretrained('junnyu/roformer_chinese_base')
>>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
```"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
......@@ -141,17 +141,17 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RoFormer sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -167,21 +167,21 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
sequence(s).
"""
sep = [self.sep_token_id]
......
......@@ -28,75 +28,76 @@ SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class SegformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.SegformerModel`. It is used
This is the configuration class to store the configuration of a [`SegformerModel`]. It is used
to instantiate a SegFormer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the SegFormer
`nvidia/segformer-b0-finetuned-ade-512-512 <https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512>`__
[nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
image_size (:obj:`int`, `optional`, defaults to 512):
image_size (`int`, *optional*, defaults to 512):
The size (resolution) of each image.
num_channels (:obj:`int`, `optional`, defaults to 3):
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
num_encoder_blocks (:obj:`int`, `optional`, defaults to 4):
num_encoder_blocks (`int`, *optional*, defaults to 4):
The number of encoder blocks (i.e. stages in the Mix Transformer encoder).
depths (:obj:`List[int]`, `optional`, defaults to [2, 2, 2, 2]):
depths (`List[int]`, *optional*, defaults to [2, 2, 2, 2]):
The number of layers in each encoder block.
sr_ratios (:obj:`List[int]`, `optional`, defaults to [8, 4, 2, 1]):
sr_ratios (`List[int]`, *optional*, defaults to [8, 4, 2, 1]):
Sequence reduction ratios in each encoder block.
hidden_sizes (:obj:`List[int]`, `optional`, defaults to [32, 64, 160, 256]):
hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
Dimension of each of the encoder blocks.
downsampling_rates (:obj:`List[int]`, `optional`, defaults to [1, 4, 8, 16]):
downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]):
Downsample rate of the image resolution compared to the original image size before each encoder block.
patch_sizes (:obj:`List[int]`, `optional`, defaults to [7, 3, 3, 3]):
patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
Patch size before each encoder block.
strides (:obj:`List[int]`, `optional`, defaults to [4, 2, 2, 2]):
strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
Stride before each encoder block.
num_attention_heads (:obj:`List[int]`, `optional`, defaults to [1, 2, 4, 8]):
num_attention_heads (`List[int]`, *optional*, defaults to [1, 2, 4, 8]):
Number of attention heads for each attention layer in each block of the Transformer encoder.
mlp_ratios (:obj:`List[int]`, `optional`, defaults to [4, 4, 4, 4]):
mlp_ratios (`List[int]`, *optional*, defaults to [4, 4, 4, 4]):
Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
encoder blocks.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
classifier_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability before the classification head.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
drop_path_rate (:obj:`float`, `optional`, defaults to 0.1):
drop_path_rate (`float`, *optional*, defaults to 0.1):
The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-6):
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
decoder_hidden_size (:obj:`int`, `optional`, defaults to 256):
decoder_hidden_size (`int`, *optional*, defaults to 256):
The dimension of the all-MLP decode head.
reshape_last_stage (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to reshape the features of the last stage back to :obj:`(batch_size, num_channels, height, width)`.
reshape_last_stage (`bool`, *optional*, defaults to `True`):
Whether to reshape the features of the last stage back to `(batch_size, num_channels, height, width)`.
Only required for the semantic segmentation model.
semantic_loss_ignore_index (:obj:`int`, `optional`, defaults to 255):
semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
The index that is ignored by the loss function of the semantic segmentation model.
Example::
Example:
>>> from transformers import SegformerModel, SegformerConfig
```python
>>> from transformers import SegformerModel, SegformerConfig
>>> # Initializing a SegFormer nvidia/segformer-b0-finetuned-ade-512-512 style configuration
>>> configuration = SegformerConfig()
>>> # Initializing a SegFormer nvidia/segformer-b0-finetuned-ade-512-512 style configuration
>>> configuration = SegformerConfig()
>>> # Initializing a model from the nvidia/segformer-b0-finetuned-ade-512-512 style configuration
>>> model = SegformerModel(configuration)
>>> # Initializing a model from the nvidia/segformer-b0-finetuned-ade-512-512 style configuration
>>> model = SegformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "segformer"
def __init__(
......
......@@ -38,28 +38,28 @@ class SegformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
r"""
Constructs a SegFormer feature extractor.
This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to resize the input based on a certain :obj:`size`.
size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 512):
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input based on a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 512):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
is set to :obj:`True`.
resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
:obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
Only has an effect if :obj:`do_resize` is set to :obj:`True`.
do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
`PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
Only has an effect if `do_resize` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`):
image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`):
image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
ImageNet std.
reduce_labels (:obj:`bool`, `optional`, defaults to :obj:`False`):
reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
background label will be replaced by 255.
......@@ -97,34 +97,36 @@ class SegformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
"""
Main method to prepare for the model one or several image(s) and optional corresponding segmentation maps.
.. warning::
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
PIL images.
</Tip>
Args:
images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is
the number of channels, H and W are image height and width.
segmentation_maps (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`, `optional`):
segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
Optionally, the corresponding semantic segmentation maps with the pixel-wise annotations.
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
* :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
:class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **labels** -- Optional labels to be fed to a model (when :obj:`segmentation_maps` are provided)
- **labels** -- Optional labels to be fed to a model (when `segmentation_maps` are provided)
"""
# Input type checking for clearer error
valid_images = False
......
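A minimal, hedged sketch of calling the feature extractor described above; the expected tensor shape is an assumption based on the default `size` of 512, not a verified output:

```python
>>> from transformers import SegformerFeatureExtractor
>>> from PIL import Image
>>> import requests

>>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # without segmentation_maps only pixel_values is returned; pass
>>> # segmentation_maps=... as well to also receive a "labels" entry
>>> encoding = feature_extractor(images=image, return_tensors="pt")
>>> encoding["pixel_values"].shape  # expected (1, 3, 512, 512) with the default size of 512
```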
......@@ -485,22 +485,23 @@ class SegformerModel(SegformerPreTrainedModel):
r"""
Returns:
Examples::
Examples:
>>> from transformers import SegformerFeatureExtractor, SegformerModel
>>> from PIL import Image
>>> import requests
```python
>>> from transformers import SegformerFeatureExtractor, SegformerModel
>>> from PIL import Image
>>> import requests
>>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
>>> model = SegformerModel("nvidia/segformer-b0-finetuned-ade-512-512")
>>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
>>> model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> sequence_output = outputs.last_hidden_state
"""
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> sequence_output = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......