"docs/vscode:/vscode.git/clone" did not exist on "a0b87245f375de9a2ff744867adc200882136171"
Unverified Commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
@@ -13,10 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities to convert slow tokenizers to their fast tokenizer counterparts.
All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow making our dependency on SentencePiece optional.
"""
from typing import Dict, List, Tuple
@@ -960,13 +960,13 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
Utilities to convert a slow tokenizer instance to a fast tokenizer instance.
Args:
transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
Instance of a slow tokenizer to convert into the backend tokenizer for
[`~tokenization_utils_base.PreTrainedTokenizerFast`].
Return:
An instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
[`~tokenization_utils_base.PreTrainedTokenizerFast`]
"""
tokenizer_class_name = transformer_tokenizer.__class__.__name__
...
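As a rough usage sketch of this converter (the checkpoint name and import path are assumptions about typical use, not part of the diff):

```python
from transformers import BertTokenizer, PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

# Load a slow (pure Python) tokenizer, then convert it into a Rust-backed
# tokenizers.Tokenizer suitable as the backend of a PreTrainedTokenizerFast.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
backend = convert_slow_tokenizer(slow_tokenizer)
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend)
```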
@@ -50,8 +50,8 @@ def default_data_collator(features: List[InputDataClass], return_tensors="pt") -
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for examples of how it's useful.
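A minimal sketch of the special `label` handling described above (the feature values are made up for illustration):

```python
from transformers import default_data_collator

features = [
    {"input_ids": [101, 2023, 102], "label": 0},
    {"input_ids": [101, 2005, 102], "label": 1},
]
batch = default_data_collator(features)
# "label" is collected under the "labels" key and stacked into a tensor;
# the same-length "input_ids" are stacked into a (2, 3) tensor.
print(batch["labels"], batch["input_ids"].shape)
```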
@@ -76,8 +76,8 @@ class DefaultDataCollator(DataCollatorMixin):
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- `label`: handles a single value (int or float) per object
- `label_ids`: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
to the model. See glue and ner for examples of how it's useful.
@@ -86,7 +86,7 @@ class DefaultDataCollator(DataCollatorMixin):
helpful if you need to set a return_tensors value at initialization.
Args:
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -213,26 +213,26 @@ class DataCollatorWithPadding:
Data collator that will dynamically pad the inputs received.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -265,28 +265,28 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -515,33 +515,33 @@ class DataCollatorForSeq2Seq:
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
model ([`PreTrainedModel`]):
The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels* method, use it to
prepare the *decoder_input_ids*
This is useful when using *label_smoothing* to avoid calculating loss twice.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""
@@ -605,26 +605,27 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
are not all of the same length.
Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
mlm (`bool`, *optional*, defaults to `True`):
Whether or not to use masked language modeling. If set to `False`, the labels are the same as the
inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
non-masked tokens and the value to predict for the masked token.
mlm_probability (`float`, *optional*, defaults to 0.15):
The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
<Tip>
For best performance, this data collator should be used with a dataset having items that are dictionaries or
BatchEncoding, with the `"special_tokens_mask"` key, as returned by a
[`PreTrainedTokenizer`] or a [`PreTrainedTokenizerFast`] with the
argument `return_special_tokens_mask=True`.
</Tip>"""
tokenizer: PreTrainedTokenizerBase
mlm: bool = True
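A short sketch of the masking behavior and the `special_tokens_mask` tip above (the checkpoint name is illustrative):

```python
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)

# return_special_tokens_mask=True lets the collator skip [CLS]/[SEP] cheaply.
examples = [tokenizer("the cat sat on the mat", return_special_tokens_mask=True)]
batch = collator(examples)
# labels hold -100 at unmasked positions and the original token id at the
# ~15% of positions selected for masking.
print(batch["input_ids"][0], batch["labels"][0])
```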
@@ -845,13 +846,14 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
<Tip>
This collator relies on details of the implementation of subword tokenization by
[`BertTokenizer`], specifically that subword tokens are prefixed with *##*. For tokenizers
that do not adhere to this scheme, this collator will produce an output that is roughly equivalent to
[`.DataCollatorForLanguageModeling`].
</Tip>"""
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
@@ -1227,14 +1229,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
import torch
@@ -1325,14 +1326,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
from random import randint
@@ -1434,14 +1434,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
from random import randint
...
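The numbered procedure above maps onto a loop along these lines (a standalone sketch of the algorithm, not the library's exact implementation):

```python
import torch
from random import randint

def span_mask(max_len: int, max_span_length: int = 5, plm_probability: float = 1 / 6) -> torch.Tensor:
    """Boolean mask built by the step 0-4 procedure described above."""
    masked = torch.zeros(max_len, dtype=torch.bool)
    cur_len = 0  # step 0: number of tokens processed so far
    while cur_len < max_len:
        span_length = randint(1, max_span_length)                         # step 1
        context_length = int(span_length / plm_probability)               # step 2
        start_index = cur_len + randint(0, context_length - span_length)  # step 3
        masked[start_index : start_index + span_length] = True
        cur_len += context_length                                         # step 4
    return masked
```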
@@ -48,20 +48,20 @@ def glue_convert_examples_to_features(
output_mode=None,
):
"""
Loads a data file into a list of `InputFeatures`
Args:
examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length. Defaults to the tokenizer's max_len
task: GLUE task
label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
output_mode: String indicating the output mode. Either `regression` or `classification`
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
`InputFeatures` which can be fed to the model.
"""
warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
@@ -84,7 +84,7 @@ if is_tf_available():
) -> tf.data.Dataset:
"""
Returns:
A `tf.data.Dataset` containing the task-specific features.
"""
processor = glue_processors[task]()
...
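A hedged usage sketch of the function above (the processor, checkpoint and data directory are illustrative):

```python
from transformers import AutoTokenizer, glue_convert_examples_to_features
from transformers.data.processors.glue import MrpcProcessor

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
processor = MrpcProcessor()
examples = processor.get_dev_examples("path/to/MRPC")  # hypothetical data dir

# Returns a list of InputFeatures ready to be fed to the model.
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")
```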
@@ -332,8 +332,8 @@ def squad_convert_examples_to_features(
model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of [`~data.processors.squad.SquadExample`]
tokenizer: an instance of a child of [`PreTrainedTokenizer`]
max_seq_length: The maximum sequence length of the inputs.
doc_stride: The stride used when the context is too large and is split across several features.
max_query_length: The maximum length of the query.
@@ -345,22 +345,23 @@ def squad_convert_examples_to_features(
Returns:
list of [`~data.processors.squad.SquadFeatures`]
Example:
```python
processor = SquadV2Processor()
examples = processor.get_dev_examples(data_dir)

features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
    doc_stride=args.doc_stride,
    max_query_length=args.max_query_length,
    is_training=not evaluate,
)
```"""
# Defining helper methods
features = []
@@ -574,23 +575,24 @@ class SquadProcessor(DataProcessor):
def get_examples_from_dataset(self, dataset, evaluate=False):
"""
Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
Args:
dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
evaluate: Boolean specifying if in evaluation mode or in training mode
Returns:
List of SquadExample
Examples:
```python
>>> import tensorflow_datasets as tfds

>>> dataset = tfds.load("squad")

>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
```"""
if evaluate:
dataset = dataset["validation"]
@@ -759,8 +761,8 @@ class SquadExample:
class SquadFeatures:
"""
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
[`~data.processors.squad.SquadExample`] using the
[`~data.processors.squad.squad_convert_examples_to_features`] method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
...
@@ -60,7 +60,7 @@ class InputFeatures:
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
tokens.
token_type_ids: (Optional) Segment token indices to indicate first and second
portions of the inputs. Only some models use them.
@@ -92,15 +92,15 @@ class DataProcessor:
raise NotImplementedError()
def get_train_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of [`InputExample`] for the test set."""
raise NotImplementedError()
def get_labels(self):
@@ -240,21 +240,21 @@ class SingleSentenceClassificationProcessor(DataProcessor):
return_tensors=None,
):
"""
Convert examples into a list of `InputFeatures`
Args:
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length
pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for
actual values)
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
`InputFeatures` which can be fed to the model.
"""
if max_length is None:
...
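To make the `mask_padding_with_zero` convention concrete, here is what it implies for a length-3 sequence padded to `max_length=5` (the values are illustrative):

```python
# pad_on_left=False, so padding goes on the right.
input_ids = [101, 7592, 102, 0, 0]

# mask_padding_with_zero=True: 1 marks actual values, 0 marks padding.
attention_mask = [1, 1, 1, 0, 0]

# mask_padding_with_zero=False inverts it: 1 for padded values, 0 for actual ones.
attention_mask_inverted = [0, 0, 0, 1, 1]
```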
@@ -28,7 +28,7 @@ logger = logging.get_logger(__name__)
class DebugUnderflowOverflow:
"""
This debug class helps detect and understand where the model starts getting very large or very small, and more
importantly `nan` or `inf` weight and activation elements.
There are 2 working modes:
@@ -37,69 +37,77 @@ class DebugUnderflowOverflow:
Mode 1: Underflow/overflow detection
To activate the underflow/overflow detection, initialize the object with the model:
```python
debug_overflow = DebugUnderflowOverflow(model)
```
then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or
output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
event, each frame reporting
1. the fully qualified module name plus the class name whose `forward` was run
2. the absolute min and max value of all elements for each module weights, and the inputs and output
For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16 mixed precision:
```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min  abs max  metadata
[...]
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
```
You can see here that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
64K, and we get an overflow.
As you can see, it's the previous frames that we need to look into when the numbers start getting very large for
fp16 numbers.
The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
By default the last 21 frames are printed. You can change the default to adjust for your needs. For example:
```python
debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
```
To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
take hours to complete, first run it with normal tracing enabled for one or a few batches as explained in the next
section.
Mode 2. Specific batch absolute min/max tracing without detection
The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
```
And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
@@ -109,28 +117,29 @@ class DebugUnderflowOverflow:
Early stopping:
You can also specify the batch number after which to stop the training, with:
```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
```
This feature is mainly useful in the tracing mode, but you can use it for any mode.
**Performance**:
As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the
training down. Therefore remember to turn it off once the debugging needs have been met.
Args:
model (`nn.Module`):
The model to debug.
max_frames_to_save (`int`, *optional*, defaults to 21):
How many frames back to record
trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
Which batch numbers to trace (turns detection off)
abort_after_batch_num (`int`, *optional*):
Whether to abort after a certain batch number has finished
"""
def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
@@ -287,7 +296,7 @@ def get_abs_min_max(var, ctx):
def detect_overflow(var, ctx):
"""
Report whether the tensor contains any `nan` or `inf` entries.
This is useful for detecting overflows/underflows and best to call right after the function that did some math that
modified the tensor in question.
@@ -300,7 +309,7 @@ def detect_overflow(var, ctx):
ctx: the message to print as a context
Return:
`True` if `inf` or `nan` was detected, `False` otherwise
"""
detected = False
if torch.isnan(var).any().item():
...
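Typical use is a one-off probe right after a suspect operation, e.g. (a minimal sketch):

```python
import torch

x = torch.full((2, 2), 1e38) * 10  # overflows float32 to inf
# Reports the problem for the given ctx and returns True when any nan/inf entry is found.
if detect_overflow(x, "after scaling"):
    print("inf detected right after the multiplication")
```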
@@ -41,16 +41,16 @@ class HfDeepSpeedConfig:
"""
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
things like the Trainer object are not available (e.g. `from_pretrained` and `_get_resized_embeddings`).
Therefore it's important that this object remains alive while the program is still running.
[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to
sync the configuration with values of [`TrainingArguments`] by replacing special placeholder
values: `"auto"`. Without this special logic the DeepSpeed configuration is not modified in any way.
Args:
config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
"""
@@ -104,7 +104,7 @@ class HfDeepSpeedConfig:
def get_value(self, ds_key_long, default=None):
"""
Returns the set value or `default` if no value is set
"""
config, ds_key = self.find_config_node(ds_key_long)
if config is None:
@@ -115,7 +115,7 @@ class HfDeepSpeedConfig:
"""
Deletes a sub-section of the config file if it's found.
Unless `must_exist` is `True` the section doesn't have to exist.
"""
config = self.config
@@ -136,8 +136,7 @@ class HfDeepSpeedConfig:
def is_true(self, ds_key_long):
"""
Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the
very specific question of whether the value is set to `True` (and it's not set to `False` or
isn't set).
"""
@@ -146,8 +145,7 @@ class HfDeepSpeedConfig:
def is_false(self, ds_key_long):
"""
Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the
very specific question of whether the value is set to `False` (and it's not set to `True` or
isn't set).
"""
value = self.get_value(ds_key_long)
@@ -165,7 +163,7 @@ class HfDeepSpeedConfig:
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has
the same lifespan as the latter.
"""
@@ -181,11 +179,11 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
A utility method that massages the config file and can optionally verify that the values match.
1. Replace "auto" values with the `TrainingArguments` value.
2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
config values and if mismatched add the entry to `self.mismatched` - will assert during
`trainer_config_finalize` for one or more mismatches.
"""
config, ds_key = self.find_config_node(ds_key_long)
@@ -207,7 +205,7 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
def trainer_config_process(self, args):
"""
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
"""
# DeepSpeed does:
@@ -373,7 +371,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inf
"""
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
Args:
trainer: Trainer object
...
@@ -40,11 +40,11 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
This is a general feature extraction class for speech recognition.
Args:
feature_size (`int`):
The feature dimension of the extracted features.
sampling_rate (`int`):
The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
padding_value (`float`):
The value that is used to fill the padding values / vectors.
"""
...@@ -79,53 +79,54 @@ class SequenceFeatureExtractor(FeatureExtractionMixin): ...@@ -79,53 +79,54 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
max sequence length in the batch. max sequence length in the batch.
Padding side (left/right) padding values are defined at the feature extractor level (with Padding side (left/right) padding values are defined at the feature extractor level (with
``self.padding_side``, ``self.padding_value``) `self.padding_side`, `self.padding_value`)
.. note:: <Tip>
If the ``processed_features`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
the result will use the same type unless you provide a different tensor type with ``return_tensors``. In the result will use the same type unless you provide a different tensor type with `return_tensors`. In
the case of PyTorch tensors, you will lose the specific device of your tensors however. the case of PyTorch tensors, you will lose the specific device of your tensors however.
</Tip>
Args: Args:
processed_features (:class:`~transformers.BatchFeature`, list of :class:`~transformers.BatchFeature`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`): processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`):
Processed inputs. Can represent one input (:class:`~transformers.BatchFeature` or :obj:`Dict[str, Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of input values / vectors (list of [`BatchFeature`],
List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchFeature`, *Dict[str, List[List[float]]]* or *List[Dict[str, List[float]]]*) so you can use this method during
`Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
preprocessing as well as in a PyTorch Dataloader collate function. preprocessing as well as in a PyTorch Dataloader collate function.
Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
tensors), see the note above for the return type.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
    Select a strategy to pad the returned sequences (according to the model's padding side and padding
    index) among:

    - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
      single sequence is provided).
    - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
      maximum acceptable input length for the model if that argument is not provided.
    - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
      different lengths).
max_length (`int`, *optional*):
    Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`):
    Activates truncation to cut input sequences longer than `max_length` to `max_length`.
pad_to_multiple_of (`int`, *optional*):
    If set will pad the sequence to a multiple of the provided value.

    This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
    >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*):
    Whether to return the attention mask. If left to the default, will return the attention mask according
    to the specific feature_extractor's default.

    [What are attention masks?](../glossary#attention-mask)
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
    If set, will return tensors instead of list of python integers. Acceptable values are:

    - `'tf'`: Return TensorFlow `tf.constant` objects.
    - `'pt'`: Return PyTorch `torch.Tensor` objects.
    - `'np'`: Return Numpy `np.ndarray` objects.
"""
# If we have a list of dicts, let's convert it in a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch DataLoader
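For illustration (not part of the diff), a minimal sketch of the `pad` call documented above; the model id, array lengths, and printed shape are placeholders:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Two raw waveforms of different lengths (placeholder data).
features = {"input_values": [np.ones(1200, dtype=np.float32), np.ones(800, dtype=np.float32)]}

# Pad to the longest sequence, round up to a multiple of 128, return PyTorch tensors.
batch = feature_extractor.pad(
    features,
    padding="longest",
    pad_to_multiple_of=128,
    return_attention_mask=True,
    return_tensors="pt",
)
print(batch["input_values"].shape)  # torch.Size([2, 1280])
```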
@@ -54,16 +54,16 @@ PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"]  # noqa: F821
class BatchFeature(UserDict):
r"""
Holds the output of the [`~SequenceFeatureExtractor.pad`] and feature extractor specific `__call__` methods.

This class is derived from a python dictionary and can be used as a dictionary.

Args:
data (`dict`):
    Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values',
    'attention_mask', etc.).
tensor_type (`Union[None, str, TensorType]`, *optional*):
    You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
    initialization.
"""
@@ -74,7 +74,7 @@ class BatchFeature(UserDict):
def __getitem__(self, item: str) -> Union[Any]:
"""
If the key is a string, returns the value of the dict associated with `key` ('input_values',
'attention_mask', etc.).
"""
if isinstance(item, str):
@@ -112,9 +112,9 @@ class BatchFeature(UserDict):
Convert the inner content to tensors.

Args:
tensor_type (`str` or [`~file_utils.TensorType`], *optional*):
    The type of tensors to use. If `str`, should be one of the values of the enum
    [`~file_utils.TensorType`]. If `None`, no modification is done.
"""
if tensor_type is None:
return self
@@ -176,13 +176,13 @@ class BatchFeature(UserDict):
# Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature
def to(self, device: Union[str, "torch.device"]) -> "BatchFeature":
"""
Send all values to device by calling `v.to(device)` (PyTorch only).

Args:
device (`str` or `torch.device`): The device to put the tensors on.

Returns:
[`BatchFeature`]: The same instance after modification.
"""
# This check catches things like APEX blindly calling "to" on all inputs to a module
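For illustration (not part of the diff), a minimal sketch of `BatchFeature` as a dict plus the two helpers above; the values are placeholders and PyTorch is assumed installed:

```python
import torch
from transformers import BatchFeature

batch = BatchFeature({"input_values": [[0.1, 0.2, 0.3]], "attention_mask": [[1, 1, 1]]})
batch = batch.convert_to_tensors("pt")  # lists become torch.Tensor objects
print(batch["input_values"].dtype)      # dict-style access, prints torch.float32

if torch.cuda.is_available():
    batch = batch.to("cuda")            # sends every tensor to the device (PyTorch only)
```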
@@ -216,83 +216,84 @@ class FeatureExtractionMixin:
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> PreTrainedFeatureExtractor:
r"""
Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature
extractor, *e.g.* a derived class of [`SequenceFeatureExtractor`].

Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
    This can be either:

    - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
      huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
      namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
    - a path to a *directory* containing a feature extractor file saved using the
      [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
      `./my_model_directory/`.
    - a path or url to a saved feature extractor JSON *file*, e.g.,
      `./my_model_directory/preprocessor_config.json`.
cache_dir (`str` or `os.PathLike`, *optional*):
    Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
    standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
    Whether or not to force to (re-)download the feature extractor files and override the cached versions
    if they exist.
resume_download (`bool`, *optional*, defaults to `False`):
    Whether or not to delete incompletely received files. Attempts to resume the download if such a file
    exists.
proxies (`Dict[str, str]`, *optional*):
    A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
    'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (`str` or `bool`, *optional*):
    The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
    generated when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
    The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
    git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
    identifier allowed by git.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
    If `False`, then this function returns just the final feature extractor object. If `True`,
    then this function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a
    dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
    part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
kwargs (`Dict[str, Any]`, *optional*):
    The values in kwargs of any keys which are feature extractor attributes will be used to override the
    loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
    controlled by the `return_unused_kwargs` keyword parameter.

<Tip>

Passing `use_auth_token=True` is required when you want to use a private model.

</Tip>

Returns:
A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`].

Examples:

```python
# We can't instantiate directly the base class `FeatureExtractionMixin` nor `SequenceFeatureExtractor`,
# so let's show the examples on a derived class: `Wav2Vec2FeatureExtractor`.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')  # Download feature_extraction_config from huggingface.co and cache.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/')  # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
assert feature_extractor.return_attention_mask is False
feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
                                                                            foo=False, return_unused_kwargs=True)
assert feature_extractor.return_attention_mask is False
assert unused_kwargs == {'foo': False}
```"""
feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_dict(feature_extractor_dict, **kwargs)
def save_pretrained(self, save_directory: Union[str, os.PathLike]):
"""
Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method.

Args:
save_directory (`str` or `os.PathLike`):
    Directory where the feature extractor JSON file will be saved (will be created if it does not exist).
"""
if os.path.isfile(save_directory):
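For illustration (not part of the diff), the save/reload round trip these two methods describe; the path and model id are placeholders:

```python
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor.save_pretrained("./my_model_directory/")  # writes preprocessor_config.json

# Later, reload from the directory created above.
reloaded = Wav2Vec2FeatureExtractor.from_pretrained("./my_model_directory/")
assert reloaded.to_dict() == feature_extractor.to_dict()
```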
@@ -309,16 +310,16 @@ class FeatureExtractionMixin:
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using `from_dict`.

Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`):
    The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.

Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
@@ -397,19 +398,19 @@ class FeatureExtractionMixin:
@classmethod
def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor:
"""
Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of
parameters.

Args:
feature_extractor_dict (`Dict[str, Any]`):
    Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
    retrieved from a pretrained checkpoint by leveraging the
    [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
kwargs (`Dict[str, Any]`):
    Additional parameters from which to initialize the feature extractor object.

Returns:
[`~feature_extraction_utils.FeatureExtractionMixin`]: The feature extractor object instantiated from those
parameters.
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
@@ -436,7 +437,7 @@ class FeatureExtractionMixin:
Serializes this instance to a Python dictionary.

Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
@@ -446,15 +447,15 @@ class FeatureExtractionMixin:
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]) -> PreTrainedFeatureExtractor:
"""
Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] from the path
to a JSON file of parameters.

Args:
json_file (`str` or `os.PathLike`):
    Path to the JSON file containing the parameters.

Returns:
A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature_extractor
object instantiated from that JSON file.
"""
with open(json_file, "r", encoding="utf-8") as reader:
@@ -467,7 +468,7 @@ class FeatureExtractionMixin:
Serializes this instance to a JSON string.

Returns:
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
"""
dictionary = self.to_dict()
@@ -483,7 +484,7 @@ class FeatureExtractionMixin:
Save this instance to a JSON file.

Args:
json_file_path (`str` or `os.PathLike`):
    Path to the JSON file in which this feature_extractor instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
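For illustration (not part of the diff), the dict and JSON round trips offered by the serialization helpers above; the model id and file name are placeholders:

```python
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

config = fe.to_dict()  # plain Python dict, includes "feature_extractor_type"
fe_from_dict = Wav2Vec2FeatureExtractor.from_dict(config)

fe.to_json_file("preprocessor_config.json")
fe_from_file = Wav2Vec2FeatureExtractor.from_json_file("preprocessor_config.json")

print(fe.to_json_string()[:80])  # the same attributes as a JSON string
```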
@@ -25,70 +25,70 @@ from .file_utils import add_start_docstrings
PROCESS_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
    Current scores of the top `2 * num_beams` non-finished beam hypotheses.
next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
    `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
    Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
pad_token_id (`int`, *optional*):
    The id of the *padding* token.
eos_token_id (`int`, *optional*):
    The id of the *end-of-sequence* token.

Return:
`UserDict`: A dictionary composed of the fields as defined above:

- **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated scores of
  all non-finished beams.
- **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens to be
  added to the non-finished beam_hypotheses.
- **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
  indicating to which beam the next tokens shall be added.
"""
FINALIZE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
final_beam_scores (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
    The final scores of all non-finished beams.
final_beam_tokens (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
    The last tokens to be added to the non-finished beam_hypotheses.
final_beam_indices (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
    The beam indices indicating to which beam the `final_beam_tokens` shall be added.
pad_token_id (`int`, *optional*):
    The id of the *padding* token.
eos_token_id (`int`, *optional*):
    The id of the *end-of-sequence* token.

Return:
`torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all
batches finished early due to the `eos_token_id`.
"""
class BeamScorer(ABC):
"""
Abstract base class for all beam scorers that are used for [`~PreTrainedModel.beam_search`] and
[`~PreTrainedModel.beam_sample`].
"""
@abstractmethod
@@ -119,36 +119,34 @@ class BeamScorer(ABC):
class BeamSearchScorer(BeamScorer):
r"""
[`BeamScorer`] implementing standard beam search decoding.

Adapted in part from [Facebook's XLM beam search code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).

Reference for the diverse beam search algorithm and implementation: [Ashwin Kalyan's DBS implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua).

Args:
batch_size (`int`):
    Batch size of `input_ids` for which standard beam search decoding is run in parallel.
max_length (`int`):
    The maximum length of the sequence to be generated.
num_beams (`int`):
    Number of beams for beam search.
device (`torch.device`):
    Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of
    `BeamSearchScorer` will be allocated.
length_penalty (`float`, *optional*, defaults to 1.0):
    Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
    model to generate shorter sequences, or to a value > 1.0 in order to encourage the model to produce
    longer sequences.
do_early_stopping (`bool`, *optional*, defaults to `False`):
    Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
    The number of beam hypotheses that shall be returned upon calling
    [`~BeamSearchScorer.finalize`].
num_beam_groups (`int`):
    Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
    beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
"""
def __init__(
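For illustration (not part of the diff): a `BeamSearchScorer` is normally built for you by `generate` when `num_beams > 1`, so a sketch through that entry point is the safest usage example; the model id and generation parameters are placeholders:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids

# generate() constructs a BeamSearchScorer internally from num_beams and related options.
outputs = model.generate(input_ids, num_beams=4, num_return_sequences=2, early_stopping=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```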
@@ -29,22 +29,22 @@ logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
    Prediction scores of a language modeling head. These can be logits for each vocabulary token when not
    using beam search or log softmax for each vocabulary token when using beam search.
kwargs:
    Additional logits processor specific kwargs.

Return:
`jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
@@ -73,10 +73,10 @@ class FlaxLogitsWarper(ABC):
class FlaxLogitsProcessorList(list):
"""
This class can be used to create a list of [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to subsequently
process a `scores` input tensor. This class inherits from list and adds a specific `__call__` method to apply
each [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -97,10 +97,10 @@ class FlaxLogitsProcessorList(list):
class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
r"""
[`FlaxLogitsWarper`] for temperature (exponential scaling of the output probability distribution).

Args:
temperature (`float`):
    The value used to modulate the logits distribution.
"""
@@ -117,16 +117,16 @@ class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
class FlaxTopPLogitsWarper(FlaxLogitsWarper):
"""
[`FlaxLogitsWarper`] that performs top-p filtering, i.e. restricting to the smallest set of most probable
tokens whose probabilities add up to `top_p` or higher.

Args:
top_p (`float`):
    If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
    kept for generation.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
@@ -159,14 +159,14 @@ class FlaxTopPLogitsWarper(FlaxLogitsWarper):
class FlaxTopKLogitsWarper(FlaxLogitsWarper):
r"""
[`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.

Args:
top_k (`int`):
    The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
@@ -195,10 +195,10 @@ class FlaxTopKLogitsWarper(FlaxLogitsWarper):
class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.

Args:
bos_token_id (`int`):
    The id of the token to force as the first generated token.
"""
@@ -219,14 +219,14 @@ class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is
reached.

Args:
max_length (`int`):
    The maximum length of the sequence to be generated.
eos_token_id (`int`):
    The id of the token to force as the last generated token when `max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
@@ -247,13 +247,13 @@ class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.

Args:
min_length (`int`):
    The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`int`):
    The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
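For illustration (not part of the diff), a minimal sketch of chaining the Flax warpers above through `FlaxLogitsProcessorList`; the import path, shapes, and vocabulary size are assumptions based on this file:

```python
import jax.numpy as jnp
from transformers.generation_flax_logits_process import (
    FlaxLogitsProcessorList,
    FlaxTemperatureLogitsWarper,
    FlaxTopKLogitsWarper,
)

warpers = FlaxLogitsProcessorList(
    [FlaxTemperatureLogitsWarper(0.7), FlaxTopKLogitsWarper(top_k=50)]
)

input_ids = jnp.ones((1, 4), dtype="i4")        # (batch_size, sequence_length)
scores = jnp.zeros((1, 32000))                  # (batch_size, vocab_size), placeholder logits
scores = warpers(input_ids, scores, cur_len=4)  # Flax processors also take cur_len
```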
@@ -48,7 +48,7 @@ class FlaxGreedySearchOutput(ModelOutput):
Args:
sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
    The generated sequences.
"""
@@ -62,7 +62,7 @@ class FlaxSampleOutput(ModelOutput):
Args:
sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
    The generated sequences.
"""
@@ -76,9 +76,9 @@ class FlaxBeamSearchOutput(ModelOutput):
Args:
sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
    The generated sequences.
scores (`jnp.ndarray` of shape `(batch_size,)`):
    The scores (log probabilities) of the generated sequences.
"""
@@ -119,7 +119,7 @@ class BeamSearchState:
class FlaxGenerationMixin:
"""
A class containing all of the functions supporting generation, to be used as a mixin in
[`FlaxPreTrainedModel`].
"""
@staticmethod
@@ -149,7 +149,7 @@ class FlaxGenerationMixin:
"""
This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom
beam search behavior. Note that the only model that overwrites this method is [`FlaxMarianMTModel`].
"""
return logits
@@ -181,61 +181,62 @@ class FlaxGenerationMixin:
Generates sequences for models with a language modeling head. The method currently supports greedy decoding
and multinomial sampling.

Apart from `input_ids`, all the arguments below will default to the value of the attribute of the same name
inside the [`PretrainedConfig`] of the model. The default values indicated are the default values of those
config attributes.

Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).

Parameters:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
    The sequence used as a prompt for the generation.
max_length (`int`, *optional*, defaults to 20):
    The maximum length of the sequence to be generated.
do_sample (`bool`, *optional*, defaults to `False`):
    Whether or not to use sampling; use greedy decoding otherwise.
temperature (`float`, *optional*, defaults to 1.0):
    The value used to modulate the next token probabilities.
top_k (`int`, *optional*, defaults to 50):
    The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`, *optional*, defaults to 1.0):
    If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
    higher are kept for generation.
pad_token_id (`int`, *optional*):
    The id of the *padding* token.
bos_token_id (`int`, *optional*):
    The id of the *beginning-of-sequence* token.
eos_token_id (`int`, *optional*):
    The id of the *end-of-sequence* token.
num_beams (`int`, *optional*, defaults to 1):
    Number of beams for beam search. 1 means no beam search.
decoder_start_token_id (`int`, *optional*):
    If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
trace (`bool`, *optional*, defaults to `True`):
    Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to
    a considerably slower runtime.
params (`Dict[str, jnp.ndarray]`, *optional*):
    Optionally the model parameters can be passed. Can be useful for parallelized generation.
model_kwargs:
    Additional model specific kwargs will be forwarded to the `forward` function of the model.

Return:
[`~file_utils.ModelOutput`].

Examples:

```python
>>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
>>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
>>> input_context = "The dog"
>>> # encode input context
>>> input_ids = tokenizer(input_context, return_tensors="np").input_ids
>>> # generate candidates using sampling
>>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
>>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
```"""
# set init values
max_length = max_length if max_length is not None else self.config.max_length
bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
@@ -326,8 +327,8 @@ class FlaxGenerationMixin:
self, top_k: int = None, top_p: float = None, temperature: float = None
) -> FlaxLogitsProcessorList:
"""
This method returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
[`FlaxLogitsWarper`] instances used for multinomial sampling.
"""
# init warp parameters
@@ -358,8 +359,8 @@ class FlaxGenerationMixin:
forced_eos_token_id: int,
) -> FlaxLogitsProcessorList:
"""
This method returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
[`FlaxLogitsProcessor`] instances used to modify the scores of the language model head.
"""
processors = FlaxLogitsProcessorList()
@@ -30,22 +30,22 @@ logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
    Prediction scores of a language modeling head. These can be logits for each vocabulary token when not
    using beam search or log softmax for each vocabulary token when using beam search.
kwargs:
    Additional logits processor specific kwargs.

Return:
`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
@@ -74,10 +74,10 @@ class LogitsWarper(ABC):
class LogitsProcessorList(list):
"""
This class can be used to create a list of [`LogitsProcessor`] or [`LogitsWarper`] to subsequently process a
`scores` input tensor. This class inherits from list and adds a specific `__call__` method to apply each
[`LogitsProcessor`] or [`LogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -98,13 +98,13 @@ class LogitsProcessorList(list):
class MinLengthLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0.

Args:
min_length (`int`):
    The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`int`):
    The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
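For illustration (not part of the diff), a minimal sketch of `MinLengthLogitsProcessor` in isolation; the ids and sizes are placeholders:

```python
import torch
from transformers import MinLengthLogitsProcessor

processor = MinLengthLogitsProcessor(min_length=10, eos_token_id=2)

input_ids = torch.ones((1, 5), dtype=torch.long)  # only 5 tokens generated so far
scores = torch.zeros((1, 100))                    # (batch_size, vocab_size), placeholder logits

scores = processor(input_ids, scores)
print(scores[0, 2])  # EOS score is -inf while the sequence is below min_length
```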
@@ -126,10 +126,10 @@ class MinLengthLogitsProcessor(LogitsProcessor):
class TemperatureLogitsWarper(LogitsWarper):
r"""
[`LogitsWarper`] for temperature (exponential scaling of the output probability distribution).

Args:
temperature (`float`):
    The value used to modulate the logits distribution.
"""
@@ -146,12 +146,11 @@ class TemperatureLogitsWarper(LogitsWarper):
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.

Args:
repetition_penalty (`float`):
    The parameter for repetition penalty. 1.0 means no penalty. See
    [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
"""
def __init__(self, penalty: float):
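For illustration (not part of the diff), a sketch combining the warper and processor above in a `LogitsProcessorList`; the values are placeholders (note that the `__init__` above takes `penalty`):

```python
import torch
from transformers import (
    LogitsProcessorList,
    RepetitionPenaltyLogitsProcessor,
    TemperatureLogitsWarper,
)

processors = LogitsProcessorList(
    [RepetitionPenaltyLogitsProcessor(penalty=1.2), TemperatureLogitsWarper(0.7)]
)

input_ids = torch.tensor([[5, 7, 7]])   # token 7 already repeated
scores = torch.randn(1, 100)            # (batch_size, vocab_size), placeholder logits
scores = processors(input_ids, scores)  # penalized, then temperature-scaled
```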
@@ -172,16 +171,16 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
class TopPLogitsWarper(LogitsWarper):
"""
[`LogitsWarper`] that performs top-p filtering, i.e. restricting to the smallest set of most probable tokens
whose probabilities add up to `top_p` or higher.

Args:
top_p (`float`):
    If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
    kept for generation.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
@@ -215,14 +214,14 @@ class TopPLogitsWarper(LogitsWarper):
class TopKLogitsWarper(LogitsWarper):
r"""
[`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.

Args:
top_k (`int`):
    The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
    All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
    Minimum number of tokens that cannot be filtered.
"""
...@@ -279,12 +278,11 @@ def _calc_banned_ngram_tokens( ...@@ -279,12 +278,11 @@ def _calc_banned_ngram_tokens(
class NoRepeatNGramLogitsProcessor(LogitsProcessor): class NoRepeatNGramLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq [`LogitsProcessor`] that enforces no repetition of n-grams. See [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
<https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__.
Args: Args:
ngram_size (:obj:`int`): ngram_size (`int`):
All ngrams of size :obj:`ngram_size` can only occur once. All ngrams of size `ngram_size` can only occur once.
""" """
def __init__(self, ngram_size: int): def __init__(self, ngram_size: int):
...@@ -305,13 +303,13 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor): ...@@ -305,13 +303,13 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor):
class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids. [`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids.
See `ParlAI <https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350>`__. See [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350).
Args: Args:
encoder_ngram_size (:obj:`int`): encoder_ngram_size (`int`):
All ngrams of size :obj:`encoder_ngram_size` can only occur within the encoder input ids. All ngrams of size `encoder_ngram_size` can only occur within the encoder input ids.
encoder_input_ids (:obj:`int`): encoder_input_ids (`int`):
The encoder_input_ids that should not be repeated within the decoder ids. The encoder_input_ids that should not be repeated within the decoder ids.
""" """
...@@ -346,15 +344,14 @@ class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): ...@@ -346,15 +344,14 @@ class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
class NoBadWordsLogitsProcessor(LogitsProcessor): class NoBadWordsLogitsProcessor(LogitsProcessor):
""" """
:class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled. [`LogitsProcessor`] that enforces that specified sequences will never be sampled.
Args: Args:
bad_words_ids (:obj:`List[List[int]]`): bad_words_ids (`List[List[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
that should not appear in the generated text, use :obj:`tokenizer(bad_word, that should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`.
add_prefix_space=True).input_ids`. eos_token_id (`int`):
eos_token_id (:obj:`int`): The id of the *end-of-sequence* token.
The id of the `end-of-sequence` token.
""" """
def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
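Following the docstring's recipe, banned sequences are usually passed through `generate` (the banned words below are arbitrary examples; model, tokenizer, and inputs are reused from the sampling example above):

```python
# Token ids for the words we never want to see in the output.
bad_words_ids = tokenizer(["vacation", "coworker"], add_prefix_space=True).input_ids
outputs = model.generate(**inputs, max_length=30, bad_words_ids=bad_words_ids)
```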
...@@ -474,16 +471,16 @@ class NoBadWordsLogitsProcessor(LogitsProcessor): ...@@ -474,16 +471,16 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):
class PrefixConstrainedLogitsProcessor(LogitsProcessor): class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned
constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more constrained generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more
information. information.
Args: Args:
prefix_allowed_tokens_fn (:obj:`Callable[[int, torch.Tensor], List[int]]`): prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
This function constrains the beam search to allowed tokens only at each step. This function takes 2 This function constrains the beam search to allowed tokens only at each step. This function takes 2
arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed
tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and tokens for the next generation step conditioned on the previously generated tokens `inputs_ids` and
the batch ID :obj:`batch_id`. the batch ID `batch_id`.
""" """
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int): def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
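A toy `prefix_allowed_tokens_fn` (a hypothetical constraint, just for illustration) that restricts every step to a small fixed vocabulary, reusing the objects from the sampling example above:

```python
# Only ever allow tokens from this small set (a toy constraint).
allowed_ids = tokenizer(" the cat sat", add_special_tokens=False).input_ids

def prefix_allowed_tokens_fn(batch_id, input_ids):
    # Called at every step with the ids generated so far for this beam.
    return allowed_ids

outputs = model.generate(
    **inputs, max_length=20, num_beams=2,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)
```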
...@@ -501,20 +498,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor): ...@@ -501,20 +498,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
class HammingDiversityLogitsProcessor(LogitsProcessor): class HammingDiversityLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only [`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only
effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse
Solutions from Neural Sequence Models <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
Args: Args:
diversity_penalty (:obj:`float`): diversity_penalty (`float`):
This value is subtracted from a beam's score if it generates the same token as any beam from another group at a This value is subtracted from a beam's score if it generates the same token as any beam from another group at a
particular time step. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled. particular time step. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
num_beams (:obj:`int`): num_beams (`int`):
Number of beams used for group beam search. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
more details. more details.
num_beam_groups (:obj:`int`): num_beam_groups (`int`):
Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
""" """
def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
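This processor is enabled automatically when `generate` is called with group beam search arguments; a sketch, again reusing the model and inputs from the sampling example above:

```python
# num_beams must be divisible by num_beam_groups; a diversity_penalty > 0
# activates the Hamming diversity term between groups.
outputs = model.generate(
    **inputs, max_length=30,
    num_beams=6, num_beam_groups=3, diversity_penalty=1.0,
)
```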
...@@ -561,10 +558,10 @@ class HammingDiversityLogitsProcessor(LogitsProcessor): ...@@ -561,10 +558,10 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
class ForcedBOSTokenLogitsProcessor(LogitsProcessor): class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token. [`LogitsProcessor`] that enforces the specified token as the first generated token.
Args: Args:
bos_token_id (:obj:`int`): bos_token_id (`int`):
The id of the token to force as the first generated token. The id of the token to force as the first generated token.
""" """
...@@ -582,14 +579,14 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor): ...@@ -582,14 +579,14 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
class ForcedEOSTokenLogitsProcessor(LogitsProcessor): class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when [`LogitsProcessor`] that enforces the specified token as the last generated token when
:obj:`max_length` is reached. `max_length` is reached.
Args: Args:
max_length (:obj:`int`): max_length (`int`):
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
eos_token_id (:obj:`int`): eos_token_id (`int`):
The id of the token to force as the last generated token when :obj:`max_length` is reached. The id of the token to force as the last generated token when `max_length` is reached.
""" """
def __init__(self, max_length: int, eos_token_id: int): def __init__(self, max_length: int, eos_token_id: int):
...@@ -607,9 +604,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor): ...@@ -607,9 +604,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
class InfNanRemoveLogitsProcessor(LogitsProcessor): class InfNanRemoveLogitsProcessor(LogitsProcessor):
r""" r"""
:class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to prevent the generation [`LogitsProcessor`] that removes all `nan` and `inf` values to prevent the generation
method from failing. Note that this logits processor should only be used if necessary, since it can slow down the method from failing. Note that this logits processor should only be used if necessary, since it can slow down the
generation method. generation method.
""" """
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
......
...@@ -11,22 +11,22 @@ from .file_utils import add_start_docstrings ...@@ -11,22 +11,22 @@ from .file_utils import add_start_docstrings
STOPPING_CRITERIA_INPUTS_DOCSTRING = r""" STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Args: Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See Indices can be obtained using [`BertTokenizer`]. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
details. details.
`What are input IDs? <../glossary.html#input-ids>`__ [What are input IDs?](../glossary#input-ids)
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax. or scores for each vocabulary token after SoftMax.
kwargs: kwargs:
Additional stopping criteria specific kwargs. Additional stopping criteria specific kwargs.
Return: Return:
:obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop. `bool`. `False` indicates we should continue, `True` indicates we should stop.
""" """
...@@ -41,11 +41,11 @@ class StoppingCriteria(ABC): ...@@ -41,11 +41,11 @@ class StoppingCriteria(ABC):
class MaxLengthCriteria(StoppingCriteria): class MaxLengthCriteria(StoppingCriteria):
""" """
This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`. This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`.
Keep in mind that, for decoder-only transformers, this will include the initial prompt tokens. Keep in mind that, for decoder-only transformers, this will include the initial prompt tokens.
Args: Args:
max_length (:obj:`int`): max_length (`int`):
The maximum length that the output sequence can have in number of tokens. The maximum length that the output sequence can have in number of tokens.
""" """
...@@ -59,14 +59,14 @@ class MaxLengthCriteria(StoppingCriteria): ...@@ -59,14 +59,14 @@ class MaxLengthCriteria(StoppingCriteria):
class MaxNewTokensCriteria(StoppingCriteria): class MaxNewTokensCriteria(StoppingCriteria):
""" """
This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`. This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`.
Keep in mind that, for decoder-only transformers, this will **not** include the initial prompt tokens. This is Keep in mind that, for decoder-only transformers, this will **not** include the initial prompt tokens. This is
very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens. very close to `MaxLengthCriteria` but ignores the number of initial tokens.
Args: Args:
start_length (:obj:`int`): start_length (`int`):
The number of initial tokens. The number of initial tokens.
max_new_tokens (:obj:`int`): max_new_tokens (`int`):
The maximum number of tokens to generate. The maximum number of tokens to generate.
""" """
...@@ -90,12 +90,12 @@ class MaxTimeCriteria(StoppingCriteria): ...@@ -90,12 +90,12 @@ class MaxTimeCriteria(StoppingCriteria):
""" """
This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
time will start being counted when you initialize this class. You can override this by passing an time will start being counted when you initialize this class. You can override this by passing an
:obj:`initial_time`. `initial_time`.
Args: Args:
max_time (:obj:`float`): max_time (`float`):
The maximum allowed time in seconds for the generation. The maximum allowed time in seconds for the generation.
initial_time (:obj:`float`, `optional`, defaults to :obj:`time.time()`): initial_time (`float`, *optional*, defaults to `time.time()`):
The start of the generation allowed time. The start of the generation allowed time.
""" """
......
...@@ -41,14 +41,14 @@ def is_torch_tensor(obj): ...@@ -41,14 +41,14 @@ def is_torch_tensor(obj):
def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
""" """
Loads :obj:`image` to a PIL Image. Loads `image` to a PIL Image.
Args: Args:
image (:obj:`str` or :obj:`PIL.Image.Image`): image (`str` or `PIL.Image.Image`):
The image to convert to the PIL Image format. The image to convert to the PIL Image format.
Returns: Returns:
:obj:`PIL.Image.Image`: A PIL Image. `PIL.Image.Image`: A PIL Image.
""" """
if isinstance(image, str): if isinstance(image, str):
if image.startswith("http://") or image.startswith("https://"): if image.startswith("http://") or image.startswith("https://"):
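A short usage sketch (the URL is a placeholder; fetching it assumes network access, and a local path or an existing PIL image works as well):

```python
from transformers.image_utils import load_image

img = load_image("https://example.com/some_image.png")  # URL, local path, or PIL image
print(img.size)
```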
...@@ -87,15 +87,15 @@ class ImageFeatureExtractionMixin: ...@@ -87,15 +87,15 @@ class ImageFeatureExtractionMixin:
def to_pil_image(self, image, rescale=None): def to_pil_image(self, image, rescale=None):
""" """
Converts :obj:`image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
axis if needed. axis if needed.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`numpy.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
The image to convert to the PIL Image format. The image to convert to the PIL Image format.
rescale (:obj:`bool`, `optional`): rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
default to :obj:`True` if the image type is a floating type, :obj:`False` otherwise. default to `True` if the image type is a floating type, `False` otherwise.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
...@@ -117,17 +117,17 @@ class ImageFeatureExtractionMixin: ...@@ -117,17 +117,17 @@ class ImageFeatureExtractionMixin:
def to_numpy_array(self, image, rescale=None, channel_first=True): def to_numpy_array(self, image, rescale=None, channel_first=True):
""" """
Converts :obj:`image` to a numpy array. Optionally rescales it and puts the channel dimension as the first Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
dimension. dimension.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to convert to a NumPy array. The image to convert to a NumPy array.
rescale (:obj:`bool`, `optional`): rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
default to :obj:`True` if the image is a PIL Image or an array/tensor of integers, :obj:`False` default to `True` if the image is a PIL Image or an array/tensor of integers, `False`
otherwise. otherwise.
channel_first (:obj:`bool`, `optional`, defaults to :obj:`True`): channel_first (`bool`, *optional*, defaults to `True`):
Whether or not to permute the dimensions of the image to put the channel dimension first. Whether or not to permute the dimensions of the image to put the channel dimension first.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
...@@ -151,15 +151,15 @@ class ImageFeatureExtractionMixin: ...@@ -151,15 +151,15 @@ class ImageFeatureExtractionMixin:
def normalize(self, image, mean, std): def normalize(self, image, mean, std):
""" """
Normalizes :obj:`image` with :obj:`mean` and :obj:`std`. Note that this will trigger a conversion of Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of
:obj:`image` to a NumPy array if it's a PIL Image. `image` to a NumPy array if it's a PIL Image.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize. The image to normalize.
mean (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`): mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
The mean (per channel) to use for normalization. The mean (per channel) to use for normalization.
std (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`): std (`List[float]` or `np.ndarray` or `torch.Tensor`):
The standard deviation (per channel) to use for normalization. The standard deviation (per channel) to use for normalization.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
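As a worked example of the arithmetic (plain NumPy, channel-first layout; the mean/std values are the common ImageNet statistics, used here only for illustration):

```python
import numpy as np

image = np.full((3, 2, 2), 0.5, dtype=np.float32)          # channel-first image
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)   # per-channel mean
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)    # per-channel std

normalized = (image - mean[:, None, None]) / std[:, None, None]
print(normalized[:, 0, 0])  # first channel: (0.5 - 0.485) / 0.229 ≈ 0.066
```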
...@@ -187,14 +187,14 @@ class ImageFeatureExtractionMixin: ...@@ -187,14 +187,14 @@ class ImageFeatureExtractionMixin:
def resize(self, image, size, resample=PIL.Image.BILINEAR): def resize(self, image, size, resample=PIL.Image.BILINEAR):
""" """
Resizes :obj:`image`. Note that this will trigger a conversion of :obj:`image` to a PIL Image. Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize. The image to resize.
size (:obj:`int` or :obj:`Tuple[int, int]`): size (`int` or `Tuple[int, int]`):
The size to use for resizing the image. The size to use for resizing the image.
resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`): resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
The filter to use for resampling. The filter to use for resampling.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
...@@ -210,13 +210,13 @@ class ImageFeatureExtractionMixin: ...@@ -210,13 +210,13 @@ class ImageFeatureExtractionMixin:
def center_crop(self, image, size): def center_crop(self, image, size):
""" """
Crops :obj:`image` to the given size using a center crop. Note that if the image is too small to be cropped to Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
the given size, it will be padded (so the returned result has the requested size). the given size, it will be padded (so the returned result has the requested size).
Args: Args:
image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize. The image to resize.
size (:obj:`int` or :obj:`Tuple[int, int]`): size (`int` or `Tuple[int, int]`):
The size to which the image is cropped. The size to which the image is cropped.
""" """
self._ensure_format_supported(image) self._ensure_format_supported(image)
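A sketch chaining the mixin's helpers directly (in the library these methods are normally called by the concrete feature extractors, so instantiating the mixin by hand is purely illustrative):

```python
from PIL import Image
from transformers.image_utils import ImageFeatureExtractionMixin

helper = ImageFeatureExtractionMixin()
image = Image.new("RGB", (640, 480))              # placeholder image
resized = helper.resize(image, size=(256, 256))   # returns a PIL image
cropped = helper.center_crop(resized, size=224)   # 224x224 center crop
array = helper.to_numpy_array(cropped)            # rescaled, channel-first
print(array.shape)                                # (3, 224, 224)
```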
......
...@@ -264,11 +264,11 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR ...@@ -264,11 +264,11 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
@functools.wraps(trainable) @functools.wraps(trainable)
def dynamic_modules_import_trainable(*args, **kwargs): def dynamic_modules_import_trainable(*args, **kwargs):
""" """
Wrapper around ``tune.with_parameters`` to ensure datasets_modules are loaded on each Actor. Wrapper around `tune.with_parameters` to ensure datasets_modules are loaded on each Actor.
Without this, an ImportError will be thrown. See https://github.com/huggingface/transformers/issues/11565. Without this, an ImportError will be thrown. See https://github.com/huggingface/transformers/issues/11565.
Assumes that ``_objective``, defined above, is a function. Assumes that `_objective`, defined above, is a function.
""" """
if is_datasets_available(): if is_datasets_available():
import datasets.load import datasets.load
...@@ -372,11 +372,10 @@ def rewrite_logs(d): ...@@ -372,11 +372,10 @@ def rewrite_logs(d):
class TensorBoardCallback(TrainerCallback): class TensorBoardCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `TensorBoard A [`TrainerCallback`] that sends the logs to [TensorBoard](https://www.tensorflow.org/tensorboard).
<https://www.tensorflow.org/tensorboard>`__.
Args: Args:
tb_writer (:obj:`SummaryWriter`, `optional`): tb_writer (`SummaryWriter`, *optional*):
The writer to use. Will instantiate one if not set. The writer to use. Will instantiate one if not set.
""" """
...@@ -461,7 +460,7 @@ class TensorBoardCallback(TrainerCallback): ...@@ -461,7 +460,7 @@ class TensorBoardCallback(TrainerCallback):
class WandbCallback(TrainerCallback): class WandbCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `Weights & Biases <https://www.wandb.com/>`__. A [`TrainerCallback`] that sends the logs to [Weights & Biases](https://www.wandb.com/).
""" """
def __init__(self): def __init__(self):
...@@ -478,22 +477,21 @@ class WandbCallback(TrainerCallback): ...@@ -478,22 +477,21 @@ class WandbCallback(TrainerCallback):
def setup(self, args, state, model, **kwargs): def setup(self, args, state, model, **kwargs):
""" """
Set up the optional Weights & Biases (`wandb`) integration. Set up the optional Weights & Biases (*wandb*) integration.
One can subclass and override this method to customize the setup if needed. Find more information `here One can subclass and override this method to customize the setup if needed. Find more information [here](https://docs.wandb.ai/integrations/huggingface). You can also override the following environment variables:
<https://docs.wandb.ai/integrations/huggingface>`__. You can also override the following environment variables:
Environment: Environment:
WANDB_LOG_MODEL (:obj:`bool`, `optional`, defaults to :obj:`False`): WANDB_LOG_MODEL (`bool`, *optional*, defaults to `False`):
Whether or not to log model as artifact at the end of training. Use along with Whether or not to log model as artifact at the end of training. Use along with
`TrainingArguments.load_best_model_at_end` to upload the best model. *TrainingArguments.load_best_model_at_end* to upload the best model.
WANDB_WATCH (:obj:`str`, `optional` defaults to :obj:`"gradients"`): WANDB_WATCH (`str`, *optional* defaults to `"gradients"`):
Can be :obj:`"gradients"`, :obj:`"all"` or :obj:`"false"`. Set to :obj:`"false"` to disable gradient Can be `"gradients"`, `"all"` or `"false"`. Set to `"false"` to disable gradient
logging or :obj:`"all"` to log gradients and parameters. logging or `"all"` to log gradients and parameters.
WANDB_PROJECT (:obj:`str`, `optional`, defaults to :obj:`"huggingface"`): WANDB_PROJECT (`str`, *optional*, defaults to `"huggingface"`):
Set this to a custom string to store results in a different project. Set this to a custom string to store results in a different project.
WANDB_DISABLED (:obj:`bool`, `optional`, defaults to :obj:`False`): WANDB_DISABLED (`bool`, *optional*, defaults to `False`):
Whether or not to disable wandb entirely. Set `WANDB_DISABLED=true` to disable. Whether or not to disable wandb entirely. Set *WANDB_DISABLED=true* to disable.
""" """
if self._wandb is None: if self._wandb is None:
return return
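These are plain environment variables, so they can be set before training starts (the values below are examples):

```python
import os

os.environ["WANDB_PROJECT"] = "my-project"   # example project name
os.environ["WANDB_LOG_MODEL"] = "true"       # upload the model as an artifact
os.environ["WANDB_WATCH"] = "all"            # log gradients and parameters
```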
...@@ -585,7 +583,7 @@ class WandbCallback(TrainerCallback): ...@@ -585,7 +583,7 @@ class WandbCallback(TrainerCallback):
class CometCallback(TrainerCallback): class CometCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML <https://www.comet.ml/site/>`__. A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/).
""" """
def __init__(self): def __init__(self):
...@@ -599,19 +597,18 @@ class CometCallback(TrainerCallback): ...@@ -599,19 +597,18 @@ class CometCallback(TrainerCallback):
Set up the optional Comet.ml integration. Set up the optional Comet.ml integration.
Environment: Environment:
COMET_MODE (:obj:`str`, `optional`): COMET_MODE (`str`, *optional*):
Whether to create an online, offline experiment or disable Comet logging. Can be "OFFLINE", "ONLINE", Whether to create an online, offline experiment or disable Comet logging. Can be "OFFLINE", "ONLINE",
or "DISABLED". Defaults to "ONLINE". or "DISABLED". Defaults to "ONLINE".
COMET_PROJECT_NAME (:obj:`str`, `optional`): COMET_PROJECT_NAME (`str`, *optional*):
Comet project name for experiments Comet project name for experiments
COMET_OFFLINE_DIRECTORY (:obj:`str`, `optional`): COMET_OFFLINE_DIRECTORY (`str`, *optional*):
Folder to use for saving offline experiments when :obj:`COMET_MODE` is "OFFLINE" Folder to use for saving offline experiments when `COMET_MODE` is "OFFLINE"
COMET_LOG_ASSETS (:obj:`str`, `optional`): COMET_LOG_ASSETS (`str`, *optional*):
Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or
"FALSE". Defaults to "TRUE". "FALSE". Defaults to "TRUE".
For a number of configurable items in the environment, see `here For a number of configurable items in the environment, see [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
<https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__.
""" """
self._initialized = True self._initialized = True
log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper() log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
...@@ -661,8 +658,7 @@ class CometCallback(TrainerCallback): ...@@ -661,8 +658,7 @@ class CometCallback(TrainerCallback):
class AzureMLCallback(TrainerCallback): class AzureMLCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `AzureML A [`TrainerCallback`] that sends the logs to [AzureML](https://pypi.org/project/azureml-sdk/).
<https://pypi.org/project/azureml-sdk/>`__.
""" """
def __init__(self, azureml_run=None): def __init__(self, azureml_run=None):
...@@ -685,7 +681,7 @@ class AzureMLCallback(TrainerCallback): ...@@ -685,7 +681,7 @@ class AzureMLCallback(TrainerCallback):
class MLflowCallback(TrainerCallback): class MLflowCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow <https://www.mlflow.org/>`__. A [`TrainerCallback`] that sends the logs to [MLflow](https://www.mlflow.org/).
""" """
def __init__(self): def __init__(self):
...@@ -705,11 +701,11 @@ class MLflowCallback(TrainerCallback): ...@@ -705,11 +701,11 @@ class MLflowCallback(TrainerCallback):
Set up the optional MLflow integration. Set up the optional MLflow integration.
Environment: Environment:
HF_MLFLOW_LOG_ARTIFACTS (:obj:`str`, `optional`): HF_MLFLOW_LOG_ARTIFACTS (`str`, *optional*):
Whether to use the MLflow .log_artifact() facility to log artifacts. Whether to use the MLflow .log_artifact() facility to log artifacts.
This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, it will copy This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to *True* or *1*, it will copy
whatever is in :class:`~transformers.TrainingArguments`'s ``output_dir`` to the local or remote whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote
artifact storage. Using it without a remote storage will just copy the files to your artifact location. artifact storage. Using it without a remote storage will just copy the files to your artifact location.
""" """
log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper() log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
...@@ -774,7 +770,7 @@ class MLflowCallback(TrainerCallback): ...@@ -774,7 +770,7 @@ class MLflowCallback(TrainerCallback):
class NeptuneCallback(TrainerCallback): class NeptuneCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that sends the logs to `Neptune <https://neptune.ai>`. A [`TrainerCallback`] that sends the logs to [Neptune](https://neptune.ai).
""" """
def __init__(self): def __init__(self):
...@@ -793,13 +789,13 @@ class NeptuneCallback(TrainerCallback): ...@@ -793,13 +789,13 @@ class NeptuneCallback(TrainerCallback):
Set up the Neptune integration. Set up the Neptune integration.
Environment: Environment:
NEPTUNE_PROJECT (:obj:`str`, `required`): NEPTUNE_PROJECT (`str`, *required*):
The project ID for the neptune.ai account. Should be in the format `workspace_name/project_name` The project ID for the neptune.ai account. Should be in the format *workspace_name/project_name*
NEPTUNE_API_TOKEN (:obj:`str`, `required`): NEPTUNE_API_TOKEN (`str`, *required*):
The API token for the neptune.ai account. The API token for the neptune.ai account.
NEPTUNE_CONNECTION_MODE (:obj:`str`, `optional`): NEPTUNE_CONNECTION_MODE (`str`, *optional*):
The Neptune connection mode; `async` by default. The Neptune connection mode; *async* by default.
NEPTUNE_RUN_NAME (:obj:`str`, `optional`): NEPTUNE_RUN_NAME (`str`, *optional*):
The name of the run process on the Neptune dashboard. The name of the run process on the Neptune dashboard.
""" """
if state.is_world_process_zero: if state.is_world_process_zero:
...@@ -831,7 +827,7 @@ class NeptuneCallback(TrainerCallback): ...@@ -831,7 +827,7 @@ class NeptuneCallback(TrainerCallback):
def __del__(self): def __del__(self):
""" """
Environment: Environment:
NEPTUNE_STOP_TIMEOUT (:obj:`int`, `optional`): NEPTUNE_STOP_TIMEOUT (`int`, *optional*):
Number of seconds to wait for all Neptune.ai tracking calls to finish before stopping the tracked Number of seconds to wait for all Neptune.ai tracking calls to finish before stopping the tracked
run. If not set, it will wait for all tracking calls to finish. run. If not set, it will wait for all tracking calls to finish.
""" """
...@@ -845,7 +841,7 @@ class NeptuneCallback(TrainerCallback): ...@@ -845,7 +841,7 @@ class NeptuneCallback(TrainerCallback):
class CodeCarbonCallback(TrainerCallback): class CodeCarbonCallback(TrainerCallback):
""" """
A :class:`~transformers.TrainerCallback` that tracks the CO2 emission of training. A [`TrainerCallback`] that tracks the CO2 emission of training.
""" """
def __init__(self): def __init__(self):
......