Unverified Commit 27b3031d authored by Sylvain Gugger, committed by GitHub

Mass conversion of documentation from rst to Markdown (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
parent 18587639
......@@ -33,134 +33,135 @@ class TFTrainingArguments(TrainingArguments):
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
itself**.
Using [`HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command
line.
Parameters:
output_dir (`str`):
The output directory where the model predictions and checkpoints will be written.
overwrite_output_dir (`bool`, *optional*, defaults to `False`):
If `True`, overwrite the content of the output directory. Use this to continue training if
`output_dir` points to a checkpoint directory.
do_train (`bool`, *optional*, defaults to `False`):
Whether to run training or not. This argument is not directly used by [`Trainer`], it's
intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details.
do_eval (`bool`, *optional*):
Whether to run evaluation on the validation set or not. Will be set to `True` if
`evaluation_strategy` is different from `"no"`. This argument is not directly used by
[`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
details.
do_predict (`bool`, *optional*, defaults to `False`):
Whether to run predictions on the test set or not. This argument is not directly used by
[`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
details.
evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
The evaluation strategy to adopt during training. Possible values are:
- `"no"`: No evaluation is done during training.
- `"steps"`: Evaluation is done (and logged) every `eval_steps`.
- `"epoch"`: Evaluation is done at the end of each epoch.
per_device_train_batch_size (`int`, *optional*, defaults to 8):
The batch size per GPU/TPU core/CPU for training.
per_device_eval_batch_size (`int`, *optional*, defaults to 8):
The batch size per GPU/TPU core/CPU for evaluation.
gradient_accumulation_steps (`int`, *optional*, defaults to 1):
Number of update steps to accumulate the gradients for, before performing a backward/update pass.
<Tip warning={true}>
When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
logging, evaluation and save will be conducted every `gradient_accumulation_steps * xxx_step` training
examples.
</Tip>
learning_rate (`float`, *optional*, defaults to 5e-5):
The initial learning rate for Adam.
weight_decay (`float`, *optional*, defaults to 0):
The weight decay to apply (if not zero).
adam_beta1 (`float`, *optional*, defaults to 0.9):
The beta1 hyperparameter for the Adam optimizer.
adam_beta2 (`float`, *optional*, defaults to 0.999):
The beta2 hyperparameter for the Adam optimizer.
adam_epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon hyperparameter for the Adam optimizer.
max_grad_norm (`float`, *optional*, defaults to 1.0):
Maximum gradient norm (for gradient clipping).
num_train_epochs (`float`, *optional*, defaults to 3.0):
Total number of training epochs to perform.
max_steps (`int`, *optional*, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides
`num_train_epochs`.
warmup_ratio (`float`, *optional*, defaults to 0.0):
Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
warmup_steps (`int`, *optional*, defaults to 0):
Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of
`warmup_ratio`.
logging_dir (`str`, *optional*):
[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to
*runs/**CURRENT_DATETIME_HOSTNAME***.
logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
The logging strategy to adopt during training. Possible values are:
- `"no"`: No logging is done during training.
- `"epoch"`: Logging is done at the end of each epoch.
- `"steps"`: Logging is done every `logging_steps`.
logging_first_step (`bool`, *optional*, defaults to `False`):
Whether to log and evaluate the first `global_step` or not.
logging_steps (`int`, *optional*, defaults to 500):
Number of update steps between two logs if `logging_strategy="steps"`.
save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
The checkpoint save strategy to adopt during training. Possible values are:
- `"no"`: No save is done during training.
- `"epoch"`: Save is done at the end of each epoch.
- `"steps"`: Save is done every `save_steps`.
save_steps (`int`, *optional*, defaults to 500):
Number of update steps between two checkpoint saves if `save_strategy="steps"`.
save_total_limit (`int`, *optional*):
If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
`output_dir`.
no_cuda (`bool`, *optional*, defaults to `False`):
Whether or not to avoid using CUDA even when it is available.
seed (`int`, *optional*, defaults to 42):
Random seed that will be set at the beginning of training.
fp16 (`bool`, *optional*, defaults to `False`):
Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training.
fp16_opt_level (`str`, *optional*, defaults to 'O1'):
For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']. See details
in the [Apex documentation](https://nvidia.github.io/apex/amp).
local_rank (`int`, *optional*, defaults to -1):
During distributed training, the rank of the process.
tpu_num_cores (`int`, *optional*):
When training on TPU, the number of TPU cores (automatically passed by the launcher script).
debug (`bool`, *optional*, defaults to `False`):
Whether to activate the trace to record computation graphs and profiling information or not.
dataloader_drop_last (`bool`, *optional*, defaults to `False`):
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
or not.
eval_steps (`int`, *optional*, defaults to 1000):
Number of update steps between two evaluations.
past_index (`int`, *optional*, defaults to -1):
Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can
make use of the past hidden states for their predictions. If this argument is set to a positive int, the
`Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model
at the next training step under the keyword argument `mems`.
tpu_name (`str`, *optional*):
The name of the TPU the process is running on.
tpu_zone (`str`, *optional*):
The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect
it from metadata.
gcp_project (`str`, *optional*):
Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
automatically detect it from metadata.
run_name (`str`, *optional*):
A descriptor for the run. Notably used for wandb logging.
xla (`bool`, *optional*):
Whether to activate XLA compilation or not.
"""
......@@ -259,7 +260,7 @@ class TFTrainingArguments(TrainingArguments):
@property
def train_batch_size(self) -> int:
"""
The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
"""
if self.per_gpu_train_batch_size:
logger.warning(
......@@ -272,7 +273,7 @@ class TFTrainingArguments(TrainingArguments):
@property
def eval_batch_size(self) -> int:
"""
The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
"""
if self.per_gpu_eval_batch_size:
logger.warning(
......
......@@ -404,12 +404,12 @@ class HFTracer(Tracer):
def path_of_module(self, mod: nn.Module) -> str:
"""
Helper method to find the qualified name of `mod` in the Module hierarchy of `root`. For example, if
`root` has a submodule named `foo`, which has a submodule named `bar`, passing `bar` into this function
will return the string "foo.bar".
Args:
mod (nn.Module): The `Module` to retrieve the qualified name for.
"""
# Prefer the O(1) algorithm
if hasattr(self, "submodule_paths") and self.submodule_paths:
......@@ -506,32 +506,32 @@ def symbolic_trace(
Performs symbolic tracing on the model.
Args:
model ([`PretrainedModel`]):
The model to trace.
input_names (`List[str]`, *optional*):
The names of the inputs of the traced model. If unset, `model.dummy_inputs.keys()` are used instead.
batch_size (`int`, *optional*, defaults to 1):
The batch size of the traced model inputs.
sequence_length (`int` or `List[int]`):
The sequence length of the traced model inputs. For sequence-to-sequence models with different sequence
lengths between the encoder and the decoder inputs, this must be `[encoder_sequence_length, decoder_sequence_length]`.
num_choices (`int`, *optional*, defaults to -1):
The number of possible choices for a multiple choice task.
Returns:
`torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model.
Example:
```python
from transformers.utils.fx import symbolic_trace
traced_model = symbolic_trace(
model,
input_names=["input_ids", "attention_mask", "token_type_ids"],
batch_size=1,
sequence_length=128,
)
```"""
if input_names is None:
input_names = model.dummy_inputs.keys()
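As a rough usage sketch (not part of this diff), the returned `GraphModule` can be called like the original model, as long as the inputs match the `batch_size` and `sequence_length` given at trace time; the checkpoint name below is only an example:

```python
import torch
from transformers import BertModel
from transformers.utils.fx import symbolic_trace

model = BertModel.from_pretrained("bert-base-uncased")
traced = symbolic_trace(
    model,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    batch_size=1,
    sequence_length=128,
)

# The traced GraphModule expects tensors shaped like the tracing configuration (1 x 128 here).
inputs = {
    "input_ids": torch.zeros(1, 128, dtype=torch.long),
    "attention_mask": torch.ones(1, 128, dtype=torch.long),
    "token_type_ids": torch.zeros(1, 128, dtype=torch.long),
}
outputs = traced(**inputs)
```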
......
......@@ -46,7 +46,7 @@ _default_log_level = logging.WARNING
def _get_default_logging_level():
"""
If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is
not, fall back to `_default_log_level`.
"""
env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
if env_level_str:
......@@ -125,18 +125,19 @@ def get_verbosity() -> int:
Return the current level for the 🤗 Transformers root logger as an int.
Returns:
`int`: The logging level.
<Tip>
🤗 Transformers has the following logging levels:
- 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL`
- 40: `transformers.logging.ERROR`
- 30: `transformers.logging.WARNING` or `transformers.logging.WARN`
- 20: `transformers.logging.INFO`
- 10: `transformers.logging.DEBUG`
</Tip>"""
_configure_library_root_logger()
return _get_library_root_logger().getEffectiveLevel()
......@@ -147,14 +148,14 @@ def set_verbosity(verbosity: int) -> None:
Set the verbosity level for the 🤗 Transformers root logger.
Args:
verbosity (`int`):
Logging level, e.g., one of:
- `transformers.logging.CRITICAL` or `transformers.logging.FATAL`
- `transformers.logging.ERROR`
- `transformers.logging.WARNING` or `transformers.logging.WARN`
- `transformers.logging.INFO`
- `transformers.logging.DEBUG`
"""
_configure_library_root_logger()
......@@ -162,22 +163,22 @@ def set_verbosity(verbosity: int) -> None:
def set_verbosity_info():
"""Set the verbosity to the :obj:`INFO` level."""
"""Set the verbosity to the `INFO` level."""
return set_verbosity(INFO)
def set_verbosity_warning():
"""Set the verbosity to the :obj:`WARNING` level."""
"""Set the verbosity to the `WARNING` level."""
return set_verbosity(WARNING)
def set_verbosity_debug():
"""Set the verbosity to the :obj:`DEBUG` level."""
"""Set the verbosity to the `DEBUG` level."""
return set_verbosity(DEBUG)
def set_verbosity_error():
"""Set the verbosity to the :obj:`ERROR` level."""
"""Set the verbosity to the `ERROR` level."""
return set_verbosity(ERROR)
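A small usage sketch for the verbosity helpers documented above (standard `transformers.logging` calls; nothing here is specific to this commit):

```python
import transformers

transformers.logging.set_verbosity_info()    # show INFO and above
print(transformers.logging.get_verbosity())  # 20, i.e. transformers.logging.INFO

# Equivalent explicit form, back to the default WARNING level.
transformers.logging.set_verbosity(transformers.logging.WARNING)
```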
......
......@@ -65,36 +65,37 @@ class NotebookProgressBar:
Class attributes (overridden by derived classes)
- **warmup** (`int`) -- The number of iterations to do at the beginning while ignoring
`update_every`.
- **update_every** (`float`) -- Since calling the time takes some time, we only do it every presumed
`update_every` seconds. The progress bar uses the average time passed up until now to guess the next
value for which it will call the update.
Args:
total (`int`):
The total number of iterations to reach.
prefix (`str`, *optional*):
A prefix to add before the progress bar.
leave (`bool`, *optional*, defaults to `True`):
Whether or not to leave the progress bar once it's completed. You can always call the
[`~utils.notebook.NotebookProgressBar.close`] method to make the bar disappear.
parent ([`~notebook.NotebookTrainingTracker`], *optional*):
A parent object (like [`~utils.notebook.NotebookTrainingTracker`]) that spawns progress
bars and handles their display. If set, the object passed must have a `display()` method.
width (`int`, *optional*, defaults to 300):
The width (in pixels) that the bar will take.
Example:
```python
import time
pbar = NotebookProgressBar(100)
for val in range(100):
    pbar.update(val)
    time.sleep(0.07)
pbar.update(100)
```"""
warmup = 5
update_every = 0.2
......@@ -118,17 +119,17 @@ class NotebookProgressBar:
def update(self, value: int, force_update: bool = False, comment: str = None):
"""
The main method to update the progress bar to `value`.
Args:
value (`int`):
The value to use. Must be between 0 and `total`.
force_update (`bool`, *optional*, defaults to `False`):
Whether or not to force an update of the internal state and display (by default, the bar will wait for
`value` to reach the value it predicted corresponds to a time of more than the `update_every`
attribute since the last update to avoid adding boilerplate).
comment (`str`, *optional*):
A comment to add on the left of the progress bar.
"""
self.value = value
......@@ -203,10 +204,10 @@ class NotebookTrainingTracker(NotebookProgressBar):
Args:
num_steps (`int`): The number of steps during training.
column_names (`List[str]`, *optional*):
The list of column names for the metrics table (will be inferred from the first call to
[`~utils.notebook.NotebookTrainingTracker.write_line`] if not set).
"""
def __init__(self, num_steps, column_names=None):
......@@ -230,7 +231,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
Write the values in the inner table.
Args:
values (`Dict[str, float]`): The values to display.
"""
if self.inner_table is None:
self.inner_table = [list(values.keys()), list(values.values())]
......@@ -250,9 +251,9 @@ class NotebookTrainingTracker(NotebookProgressBar):
easily updated).
Args:
total (`int`): The number of iterations for the child progress bar.
prefix (`str`, *optional*): A prefix to write on the left of the progress bar.
width (`int`, *optional*, defaults to 300): The width (in pixels) of the progress bar.
"""
self.child_bar = NotebookProgressBar(total, prefix=prefix, parent=self, width=width)
return self.child_bar
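For orientation, here is a small, hypothetical sketch of the tracker API documented above; it assumes it runs inside a Jupyter notebook (where the HTML display is available), and the metric values are made up:

```python
from transformers.utils.notebook import NotebookTrainingTracker

tracker = NotebookTrainingTracker(num_steps=100, column_names=["Step", "Loss"])
for step in range(1, 101):
    tracker.update(step)
    if step % 50 == 0:
        tracker.write_line({"Step": step, "Loss": 1.0 / step})

# A child bar spawned from the tracker, e.g. for an evaluation loop.
eval_bar = tracker.add_child(total=20)
for val in range(1, 21):
    eval_bar.update(val)
```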
......@@ -267,7 +268,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
class NotebookProgressCallback(TrainerCallback):
"""
A [`TrainerCallback`] that displays the progress of training or evaluation, optimized for
Jupyter Notebooks or Google Colab.
"""
......
......@@ -55,18 +55,18 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None:
"""
Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
The installed module version comes from the *site-packages* dir via *importlib_metadata*.
Args:
requirement (`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
hint (`str`, *optional*): what suggestion to print in case of requirements not being met
Example:
```python
require_version("pandas>1.1.2")
require_version("numpy>1.18.5", "this is important to have for whatever reason")
```"""
hint = f"\n{hint}" if hint is not None else ""
......
......@@ -28,108 +28,110 @@ logger = logging.get_logger(__name__)
class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~{{cookiecutter.camelcase_modelname}}Model`].
It is used to instantiate an {{cookiecutter.modelname}} model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the {{cookiecutter.modelname}} [{{cookiecutter.checkpoint_identifier}}](https://huggingface.co/{{cookiecutter.checkpoint_identifier}}) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used
to control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Args:
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
[`~TF{{cookiecutter.camelcase_modelname}}Model`].
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler.
If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
[`~TF{{cookiecutter.camelcase_modelname}}Model`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
{% else -%}
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
[`~TF{{cookiecutter.camelcase_modelname}}Model`].
d_model (`int`, *optional*, defaults to 1024):
Dimension of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
`"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the classifier.
max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
{% endif -%}
Example:
```python
>>> from transformers import {{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}Config
>>> # Initializing a {{cookiecutter.modelname}} {{cookiecutter.checkpoint_identifier}} style configuration
>>> configuration = {{cookiecutter.camelcase_modelname}}Config()
>>> # Initializing a model from the {{cookiecutter.checkpoint_identifier}} style configuration
>>> model = {{cookiecutter.camelcase_modelname}}Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "{{cookiecutter.lowercase_modelname}}"
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
{% else -%}
......
......@@ -42,12 +42,12 @@ PRETRAINED_INIT_CONFIGURATION = {
class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
:class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
[`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BertTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
......@@ -86,12 +86,12 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
r"""
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
:class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs
[`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BartTokenizerFast`] and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning
Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning
parameters.
"""
......@@ -129,10 +129,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
Path to the vocabulary file.
"""
......@@ -182,13 +182,13 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
{{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......
......@@ -43,10 +43,10 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer):
r"""
Construct a {{cookiecutter.modelname}} tokenizer.
[`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
parameters.
"""
......@@ -85,10 +85,10 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
"""
Construct a {{cookiecutter.modelname}} tokenizer.
[`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BartTokenizer`] and runs end-to-end
tokenization: punctuation splitting and wordpiece.
Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning
parameters.
"""
......@@ -125,7 +125,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
Construct a {{cookiecutter.modelname}} tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
......@@ -173,11 +173,11 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
def build_inputs_with_special_tokens(
......@@ -188,17 +188,17 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
by concatenating and adding special tokens.
A {{cookiecutter.modelname}} sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
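Since the class above is a cookiecutter template, here is a hedged illustration using `BartTokenizer`, whose special-token layout matches the `<s> X </s>` / `<s> A </s></s> B </s>` format described above; the checkpoint name is only an example:

```python
from transformers import BartTokenizer

tok = BartTokenizer.from_pretrained("facebook/bart-base")
ids_a = tok.convert_tokens_to_ids(tok.tokenize("Hello world"))
ids_b = tok.convert_tokens_to_ids(tok.tokenize("How are you?"))

pair = tok.build_inputs_with_special_tokens(ids_a, ids_b)
mask = tok.get_special_tokens_mask(ids_a, ids_b)

print(pair[0], pair[-1])  # <s> and </s> token ids (0 and 2 for BART)
print(mask[0], mask[-1])  # 1 for the special tokens at both ends
```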
......@@ -211,18 +211,18 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -241,13 +241,13 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
{{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -264,10 +264,10 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
Args:
vocab_file (:obj:`str`):
vocab_file (`str`):
Path to the vocabulary file.
"""
......@@ -317,13 +317,13 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
{{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......
......@@ -593,6 +593,51 @@ def check_all_objects_are_documented():
"The following objects are in the public init so should be documented:\n - "
+ "\n - ".join(undocumented_objs)
)
check_docstrings_are_in_md()
# Re pattern to catch :obj:`xx`, :class:`xx`, :func:`xx` or :meth:`xx`.
_re_rst_special_words = re.compile(r":(?:obj|func|class|meth):`([^`]+)`")
# Re pattern to catch things between double backquotes.
_re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)")
# Re pattern to catch example introduction.
_re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE)
def is_rst_docstring(docstring):
"""
Returns `True` if `docstring` is written in rst.
"""
if _re_rst_special_words.search(docstring) is not None:
return True
if _re_double_backquotes.search(docstring) is not None:
return True
if _re_rst_example.search(docstring) is not None:
return True
return False
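As a quick, hypothetical illustration of the detector above (assuming `is_rst_docstring` is in scope):

```python
# :obj:`...` markup trips the rst detector; plain Markdown backticks do not.
assert is_rst_docstring("Returns :obj:`int`: The logging level.")
assert not is_rst_docstring("Returns `int`: The logging level.")
assert is_rst_docstring("Example::\n    do_something()")  # rst example introduction
```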
def check_docstrings_are_in_md():
"""Check all docstrings are in md"""
files_with_rst = []
for file in Path(PATH_TO_TRANSFORMERS).glob("**/*.py"):
with open(file, "r") as f:
code = f.read()
docstrings = code.split('"""')
for idx, docstring in enumerate(docstrings):
if idx % 2 == 0 or not is_rst_docstring(docstring):
continue
files_with_rst.append(file)
break
if len(files_with_rst) > 0:
raise ValueError(
"The following files have docstrings written in rst:\n"
+ "\n".join([f"- {f}" for f in files_with_rst])
+ "To fix this run `doc_builder convert path_to_py_file` after installing `doc_builder`\n"
"(`pip install git+https://github.com/huggingface/doc-builder`)"
)
def check_repo_quality():
......